/* sm3_asm.S */
/*
 * Copyright (C) 2006-2025 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

#ifdef WOLFSSL_USER_SETTINGS
#ifdef WOLFSSL_USER_SETTINGS_ASM
/*
 * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
 * The script takes in a user_settings.h and produces user_settings_asm.h, which
 * is a stripped down version of user_settings.h containing only preprocessor
 * directives. This makes the header safe to include in assembly (.S) files.
 */
#include "user_settings_asm.h"
#else
/*
 * Note: if user_settings.h contains any C code (e.g. a typedef or function
 * prototype), including it here in an assembly (.S) file will cause an
 * assembler failure. See user_settings_asm.h above.
 */
#include "user_settings.h"
#endif /* WOLFSSL_USER_SETTINGS_ASM */
#endif /* WOLFSSL_USER_SETTINGS */

#ifndef HAVE_INTEL_AVX1
#define HAVE_INTEL_AVX1
#endif /* HAVE_INTEL_AVX1 */
#ifndef NO_AVX2_SUPPORT
#ifndef HAVE_INTEL_AVX2
#define HAVE_INTEL_AVX2
#endif /* HAVE_INTEL_AVX2 */
#endif /* NO_AVX2_SUPPORT */

#ifdef WOLFSSL_SM3
#ifdef WOLFSSL_X86_64_BUILD
#ifdef HAVE_INTEL_AVX1
#ifndef __APPLE__
.data
#else
.section	__DATA,__data
#endif /* __APPLE__ */
/*
 * Per-round SM3 constants, pre-rotated: entry j holds T_j <<< (j mod 32).
 * Rounds 0-15 use T = 0x79cc4519; rounds 16-63 use T = 0x7a879d8a (which is
 * why the 16..19 row and the 48..51 row are identical — the rotation count
 * wraps mod 32).  Pre-rotating here removes a per-round rotate from the
 * compression loop, which loads one 32-bit word per round with a scalar
 * "movl N+L_SM3_AVX1_t(%rip)", so this table needs no 16-byte alignment.
 */
L_SM3_AVX1_t:
.long	0x79cc4519,0xf3988a32,0xe7311465,0xce6228cb   /* rounds  0-3  */
.long	0x9cc45197,0x3988a32f,0x7311465e,0xe6228cbc   /* rounds  4-7  */
.long	0xcc451979,0x988a32f3,0x311465e7,0x6228cbce   /* rounds  8-11 */
.long	0xc451979c,0x88a32f39,0x11465e73,0x228cbce6   /* rounds 12-15 */
.long	0x9d8a7a87,0x3b14f50f,0x7629ea1e,0xec53d43c   /* rounds 16-19 */
.long	0xd8a7a879,0xb14f50f3,0x629ea1e7,0xc53d43ce   /* rounds 20-23 */
.long	0x8a7a879d,0x14f50f3b,0x29ea1e76,0x53d43cec   /* rounds 24-27 */
.long	0xa7a879d8,0x4f50f3b1,0x9ea1e762,0x3d43cec5   /* rounds 28-31 */
.long	0x7a879d8a,0xf50f3b14,0xea1e7629,0xd43cec53   /* rounds 32-35 */
.long	0xa879d8a7,0x50f3b14f,0xa1e7629e,0x43cec53d   /* rounds 36-39 */
.long	0x879d8a7a,0xf3b14f5,0x1e7629ea,0x3cec53d4    /* rounds 40-43 */
.long	0x79d8a7a8,0xf3b14f50,0xe7629ea1,0xcec53d43   /* rounds 44-47 */
.long	0x9d8a7a87,0x3b14f50f,0x7629ea1e,0xec53d43c   /* rounds 48-51 */
.long	0xd8a7a879,0xb14f50f3,0x629ea1e7,0xc53d43ce   /* rounds 52-55 */
.long	0x8a7a879d,0x14f50f3b,0x29ea1e76,0x53d43cec   /* rounds 56-59 */
.long	0xa7a879d8,0x4f50f3b1,0x9ea1e762,0x3d43cec5   /* rounds 60-63 */
#ifndef __APPLE__
.data
#else
.section	__DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align	16
#else
.p2align	4
#endif /* __APPLE__ */
/*
 * Byte-shuffle control mask whose per-byte pattern (03,02,01,00, 07,06,05,04,
 * 0b,0a,09,08, 0f,0e,0d,0c) reverses the bytes within each 32-bit lane —
 * presumably applied with vpshufb to byte-swap the big-endian SM3 message
 * words (the shuffle itself is outside this chunk; verify against the rest
 * of the function).  The 16-byte alignment above is mandatory: the mask is
 * loaded into %xmm11 with the alignment-checking vmovdqa instruction.
 */
L_SM3_AVX1_flip_mask:
.quad	0x405060700010203, 0xc0d0e0f08090a0b
#ifndef __APPLE__
.text
.globl	sm3_compress_avx1
.type	sm3_compress_avx1,@function
.align	16
sm3_compress_avx1:
#else
.section	__TEXT,__text
.globl	_sm3_compress_avx1
.p2align	4
_sm3_compress_avx1:
#endif /* __APPLE__ */
        pushq	%rbx
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        subq	$0x110, %rsp
        leaq	32(%rdi), %rax
        vmovdqa	L_SM3_AVX1_flip_mask(%rip), %xmm11
        movl	(%rdi), %r8d
        movl	4(%rdi), %r9d
        movl	8(%rdi), %r10d
        movl	12(%rdi), %r11d
        movl	16(%rdi), %r12d
        movl	20(%rdi), %r13d
        movl	24(%rdi), %r14d
        movl	28(%rdi), %r15d
        # X0, X1, X2, X3 = W[0..15]
        vmovdqu	(%rax), %xmm0
        vmovdqu	16(%rax), %xmm1
        vmovdqu	32(%rax), %xmm2
        vmovdqu	48(%rax), %xmm3
        # x_to_w: 0
        vmovdqu	%xmm0, (%rsp)
        vmovdqu	%xmm1, 16(%rsp)
        vmovdqu	%xmm2, 32(%rsp)
        vmovdqu	%xmm3, 48(%rsp)
        # msg_sched: 0-3
        # iter_0: 0 - 0
        movl	0+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_0: 1 - 1
        movl	16(%rsp), %eax
        addl	%ecx, %edx
        movl	(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_0: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_0: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_0: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_0: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_0: 6 - 7
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_1: 0 - 0
        movl	4+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_1: 1 - 2
        movl	20(%rsp), %eax
        addl	%ecx, %edx
        movl	4(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_1: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_1: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_1: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_1: 6 - 6
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_1: 7 - 7
        xorl	%ebx, %r10d
        # iter_2: 0 - 0
        movl	8+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_2: 1 - 1
        movl	24(%rsp), %eax
        addl	%ecx, %edx
        movl	8(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_2: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_2: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_2: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_2: 6 - 6
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_2: 7 - 7
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_3: 0 - 0
        movl	12+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_3: 1 - 1
        movl	28(%rsp), %eax
        addl	%ecx, %edx
        movl	12(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_3: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_3: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_3: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_3: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_3: 6 - 6
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        # iter_3: 7 - 7
        xorl	%ebx, %r8d
        # msg_sched done: 0-3
        # msg_sched: 4-7
        # iter_4: 0 - 0
        movl	16+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_4: 1 - 1
        movl	32(%rsp), %eax
        addl	%ecx, %edx
        movl	16(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_4: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_4: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_4: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_4: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_4: 6 - 7
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_5: 0 - 0
        movl	20+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_5: 1 - 2
        movl	36(%rsp), %eax
        addl	%ecx, %edx
        movl	20(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_5: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_5: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_5: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_5: 6 - 6
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_5: 7 - 7
        xorl	%ebx, %r14d
        # iter_6: 0 - 0
        movl	24+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_6: 1 - 1
        movl	40(%rsp), %eax
        addl	%ecx, %edx
        movl	24(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_6: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_6: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_6: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_6: 6 - 6
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_6: 7 - 7
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_7: 0 - 0
        movl	28+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_7: 1 - 1
        movl	44(%rsp), %eax
        addl	%ecx, %edx
        movl	28(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_7: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_7: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_7: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_7: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_7: 6 - 6
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        # iter_7: 7 - 7
        xorl	%ebx, %r12d
        # msg_sched done: 4-7
        # x2_to_w: 16
        vmovdqu	%xmm0, 64(%rsp)
        vmovdqu	%xmm1, 80(%rsp)
        # msg_sched: 8-11
        # iter_8: 0 - 0
        movl	32+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_8: 1 - 1
        movl	48(%rsp), %eax
        addl	%ecx, %edx
        movl	32(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_8: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_8: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_8: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_8: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_8: 6 - 7
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_9: 0 - 0
        movl	36+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_9: 1 - 2
        movl	52(%rsp), %eax
        addl	%ecx, %edx
        movl	36(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_9: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_9: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_9: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_9: 6 - 6
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_9: 7 - 7
        xorl	%ebx, %r10d
        # iter_10: 0 - 0
        movl	40+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_10: 1 - 1
        movl	56(%rsp), %eax
        addl	%ecx, %edx
        movl	40(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_10: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_10: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_10: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_10: 6 - 6
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_10: 7 - 7
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_11: 0 - 0
        movl	44+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_11: 1 - 1
        movl	60(%rsp), %eax
        addl	%ecx, %edx
        movl	44(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_11: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_11: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_11: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_11: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_11: 6 - 6
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        # iter_11: 7 - 7
        xorl	%ebx, %r8d
        # msg_sched done: 8-11
        # msg_sched: 12-15
        # iter_12: 0 - 0
        movl	48+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_12: 1 - 1
        movl	64(%rsp), %eax
        addl	%ecx, %edx
        movl	48(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_12: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_12: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_12: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_12: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_12: 6 - 7
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_13: 0 - 0
        movl	52+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_13: 1 - 2
        movl	68(%rsp), %eax
        addl	%ecx, %edx
        movl	52(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_13: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_13: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_13: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_13: 6 - 6
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_13: 7 - 7
        xorl	%ebx, %r14d
        # iter_14: 0 - 0
        movl	56+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_14: 1 - 1
        movl	72(%rsp), %eax
        addl	%ecx, %edx
        movl	56(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_14: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_14: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_14: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_14: 6 - 6
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_14: 7 - 7
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_15: 0 - 0
        movl	60+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_15: 1 - 1
        movl	76(%rsp), %eax
        addl	%ecx, %edx
        movl	60(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_15: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_15: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_15: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_15: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_15: 6 - 6
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        # iter_15: 7 - 7
        xorl	%ebx, %r12d
        # msg_sched done: 12-15
        # x2_to_w: 24
        vmovdqu	%xmm2, 96(%rsp)
        vmovdqu	%xmm3, 112(%rsp)
        # msg_sched: 16-19
        # iter_16: 0 - 0
        movl	64+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_16: 1 - 1
        movl	80(%rsp), %eax
        addl	%ecx, %edx
        movl	64(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_16: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_16: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_16: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_16: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_16: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_17: 0 - 0
        movl	68+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_17: 1 - 2
        movl	84(%rsp), %eax
        addl	%ecx, %edx
        movl	68(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_17: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_17: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_17: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_17: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_17: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_18: 0 - 0
        movl	72+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_18: 1 - 1
        movl	88(%rsp), %eax
        addl	%ecx, %edx
        movl	72(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_18: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_18: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_18: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_18: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_18: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_19: 0 - 0
        movl	76+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_19: 1 - 1
        movl	92(%rsp), %eax
        addl	%ecx, %edx
        movl	76(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_19: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_19: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_19: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_19: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_19: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_19: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 16-19
        # msg_sched: 20-23
        # iter_20: 0 - 0
        movl	80+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_20: 1 - 1
        movl	96(%rsp), %eax
        addl	%ecx, %edx
        movl	80(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_20: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_20: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_20: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_20: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_20: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_21: 0 - 0
        movl	84+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_21: 1 - 2
        movl	100(%rsp), %eax
        addl	%ecx, %edx
        movl	84(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_21: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_21: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_21: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_21: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_21: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_22: 0 - 0
        movl	88+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_22: 1 - 1
        movl	104(%rsp), %eax
        addl	%ecx, %edx
        movl	88(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_22: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_22: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_22: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_22: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_22: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_23: 0 - 0
        movl	92+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_23: 1 - 1
        movl	108(%rsp), %eax
        addl	%ecx, %edx
        movl	92(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_23: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_23: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_23: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_23: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_23: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_23: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 20-23
        # x2_to_w: 32
        vmovdqu	%xmm0, 128(%rsp)
        vmovdqu	%xmm1, 144(%rsp)
        # msg_sched: 24-27
        # (review) Four SM3 compression rounds (j = 24..27) interleaved with
        # AVX1 expansion of message words W[40..43] (result blended into xmm2;
        # a later x2_to_w stores it to 160(%rsp) = 4*40).
        # Scalar round pattern (j >= 16):
        #   SS1 = rotl(rotl(A,12) + E + T'[j], 7); SS2 = SS1 ^ rotl(A,12)
        # with the pre-rotated constant T'[j] = rotl(T,j) loaded from
        # L_SM3_AVX1_t at offset 4*j, and W[j] / W[j]^W[j+4] read from stack
        # slots 4*j(%rsp) / 4*(j+4)(%rsp) (16 bytes apart).  The and/xor
        # chains compute FF = ((x^y)&(x^z))^x (majority) and
        # GG = ((f^g)&e)^g (choose).  TT2 is finished with
        # P0(x) = x ^ rotl(x,9) ^ rotl(x,17), built as the
        # rotl8/xor/rotl9/xor sequence; the stray rotl9/rotl19 rotate the
        # outgoing A/E for the working-state shuffle (B<-rotl(A,9),
        # F<-rotl(E,19)).  Vector lanes: the vpslld/vpsrld pairs summing to
        # 32 (7|25, 15|17, 23|9) are rotl7/rotl15/rotl23, combined with
        # vpor/vpxor into P1(x) = x ^ rotl(x,15) ^ rotl(x,23).
        # NOTE(review): register <-> A..H mapping is inferred from the
        # rotation pattern; confirm against the wolfSSL asm generator.
        # iter_24: 0 - 0
        movl	96+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_24: 1 - 1
        movl	112(%rsp), %eax
        addl	%ecx, %edx
        movl	96(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_24: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_24: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_24: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_24: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_24: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_25: 0 - 0
        movl	100+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_25: 1 - 2
        movl	116(%rsp), %eax
        addl	%ecx, %edx
        movl	100(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_25: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_25: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_25: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_25: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_25: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_26: 0 - 0
        movl	104+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_26: 1 - 1
        movl	120(%rsp), %eax
        addl	%ecx, %edx
        movl	104(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_26: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_26: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_26: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_26: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_26: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_27: 0 - 0
        movl	108+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_27: 1 - 1
        movl	124(%rsp), %eax
        addl	%ecx, %edx
        movl	108(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_27: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_27: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_27: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_27: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_27: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_27: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 24-27
        # msg_sched: 28-31
        # (review) SM3 rounds j = 28..31 interleaved with AVX1 expansion of
        # W[44..47] (blended into xmm3; stored to 176(%rsp) = 4*44 below).
        # Per round: SS1 = rotl(rotl(A,12) + E + T'[j], 7),
        # SS2 = SS1 ^ rotl(A,12); T'[j] = rotl(T,j) comes from L_SM3_AVX1_t
        # at offset 4*j, and W[j] / W[j]^W[j+4] come from stack slots
        # 4*j(%rsp) / 4*(j+4)(%rsp).  FF = majority and GG = choose via the
        # and/xor chains; P0(x) = x^rotl(x,9)^rotl(x,17) finishes TT2
        # (rotl8/xor/rotl9/xor); the vector rotl pairs (vpslld/vpsrld
        # summing to 32) implement P1(x) = x^rotl(x,15)^rotl(x,23).
        # NOTE(review): variable<->register mapping inferred from the
        # rotation pattern; confirm against the generator.
        # iter_28: 0 - 0
        movl	112+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_28: 1 - 1
        movl	128(%rsp), %eax
        addl	%ecx, %edx
        movl	112(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_28: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_28: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_28: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_28: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_28: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_29: 0 - 0
        movl	116+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_29: 1 - 2
        movl	132(%rsp), %eax
        addl	%ecx, %edx
        movl	116(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_29: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_29: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_29: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_29: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_29: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_30: 0 - 0
        movl	120+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_30: 1 - 1
        movl	136(%rsp), %eax
        addl	%ecx, %edx
        movl	120(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_30: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_30: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_30: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_30: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_30: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_31: 0 - 0
        movl	124+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_31: 1 - 1
        movl	140(%rsp), %eax
        addl	%ecx, %edx
        movl	124(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_31: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_31: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_31: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_31: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_31: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_31: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 28-31
        # x2_to_w: 40
        # (review) Spill newly expanded W[40..47] (xmm2:xmm3) into the
        # stack word buffer at 4*j(%rsp): 160 = 4*40, 176 = 4*44.
        vmovdqu	%xmm2, 160(%rsp)
        vmovdqu	%xmm3, 176(%rsp)
        # msg_sched: 32-35
        # (review) SM3 rounds j = 32..35 interleaved with AVX1 expansion of
        # W[48..51] (blended into xmm0; a later x2_to_w stores it to
        # 192(%rsp) = 4*48).  Per round: SS1 = rotl(rotl(A,12) + E + T'[j],
        # 7), SS2 = SS1 ^ rotl(A,12); T'[j] = rotl(T,j) from L_SM3_AVX1_t
        # at offset 4*j; W[j] / W[j]^W[j+4] from stack slots 4*j(%rsp) /
        # 4*(j+4)(%rsp).  FF = majority, GG = choose via the and/xor
        # chains; P0(x) = x^rotl(x,9)^rotl(x,17) finishes TT2
        # (rotl8/xor/rotl9/xor); vpslld/vpsrld pairs summing to 32 build
        # the rotations of P1(x) = x^rotl(x,15)^rotl(x,23).
        # NOTE(review): variable<->register mapping inferred; confirm
        # against the generator.
        # iter_32: 0 - 0
        movl	128+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_32: 1 - 1
        movl	144(%rsp), %eax
        addl	%ecx, %edx
        movl	128(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_32: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_32: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_32: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_32: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_32: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_33: 0 - 0
        movl	132+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_33: 1 - 2
        movl	148(%rsp), %eax
        addl	%ecx, %edx
        movl	132(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_33: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_33: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_33: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_33: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_33: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_34: 0 - 0
        movl	136+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_34: 1 - 1
        movl	152(%rsp), %eax
        addl	%ecx, %edx
        movl	136(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_34: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_34: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_34: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_34: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_34: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_35: 0 - 0
        movl	140+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_35: 1 - 1
        movl	156(%rsp), %eax
        addl	%ecx, %edx
        movl	140(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_35: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_35: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_35: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_35: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_35: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_35: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 32-35
        # msg_sched: 36-39
        # (review) SM3 rounds j = 36..39 interleaved with AVX1 expansion of
        # W[52..55] (blended into xmm1; stored to 208(%rsp) = 4*52 below).
        # Per round: SS1 = rotl(rotl(A,12) + E + T'[j], 7),
        # SS2 = SS1 ^ rotl(A,12); T'[j] = rotl(T,j) from L_SM3_AVX1_t at
        # offset 4*j; W[j] / W[j]^W[j+4] from stack slots 4*j(%rsp) /
        # 4*(j+4)(%rsp).  FF = majority, GG = choose via the and/xor
        # chains; P0(x) = x^rotl(x,9)^rotl(x,17) finishes TT2
        # (rotl8/xor/rotl9/xor); the vpslld/vpsrld pairs summing to 32
        # build the rotations of P1(x) = x^rotl(x,15)^rotl(x,23).
        # NOTE(review): variable<->register mapping inferred; confirm
        # against the generator.
        # iter_36: 0 - 0
        movl	144+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_36: 1 - 1
        movl	160(%rsp), %eax
        addl	%ecx, %edx
        movl	144(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_36: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_36: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_36: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_36: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_36: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_37: 0 - 0
        movl	148+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_37: 1 - 2
        movl	164(%rsp), %eax
        addl	%ecx, %edx
        movl	148(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_37: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_37: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_37: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_37: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_37: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_38: 0 - 0
        movl	152+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_38: 1 - 1
        movl	168(%rsp), %eax
        addl	%ecx, %edx
        movl	152(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_38: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_38: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_38: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_38: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_38: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_39: 0 - 0
        movl	156+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_39: 1 - 1
        movl	172(%rsp), %eax
        addl	%ecx, %edx
        movl	156(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_39: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_39: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_39: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_39: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_39: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_39: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 36-39
        # x2_to_w: 48
        # (review) Spill newly expanded W[48..55] (xmm0:xmm1) into the
        # stack word buffer at 4*j(%rsp): 192 = 4*48, 208 = 4*52.
        vmovdqu	%xmm0, 192(%rsp)
        vmovdqu	%xmm1, 208(%rsp)
        # msg_sched: 40-43
        # (review) SM3 rounds j = 40..43 interleaved with AVX1 expansion of
        # the next four message words (blended into xmm2; presumably
        # W[56..59], stored by a later x2_to_w outside this section —
        # confirm).  Per round: SS1 = rotl(rotl(A,12) + E + T'[j], 7),
        # SS2 = SS1 ^ rotl(A,12); T'[j] = rotl(T,j) from L_SM3_AVX1_t at
        # offset 4*j; W[j] / W[j]^W[j+4] from stack slots 4*j(%rsp) /
        # 4*(j+4)(%rsp).  FF = majority, GG = choose via the and/xor
        # chains; P0(x) = x^rotl(x,9)^rotl(x,17) finishes TT2
        # (rotl8/xor/rotl9/xor); the vpslld/vpsrld pairs summing to 32
        # build the rotations of P1(x) = x^rotl(x,15)^rotl(x,23).
        # NOTE(review): variable<->register mapping inferred; confirm
        # against the generator.
        # iter_40: 0 - 0
        movl	160+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_40: 1 - 1
        movl	176(%rsp), %eax
        addl	%ecx, %edx
        movl	160(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_40: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_40: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_40: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_40: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_40: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_41: 0 - 0
        movl	164+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_41: 1 - 2
        movl	180(%rsp), %eax
        addl	%ecx, %edx
        movl	164(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_41: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_41: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_41: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_41: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_41: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_42: 0 - 0
        movl	168+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_42: 1 - 1
        movl	184(%rsp), %eax
        addl	%ecx, %edx
        movl	168(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_42: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_42: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_42: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_42: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_42: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_43: 0 - 0
        movl	172+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_43: 1 - 1
        movl	188(%rsp), %eax
        addl	%ecx, %edx
        movl	172(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_43: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_43: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_43: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_43: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_43: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_43: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 40-43
        # msg_sched: 44-47
        # iter_44: 0 - 0
        movl	176+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_44: 1 - 1
        movl	192(%rsp), %eax
        addl	%ecx, %edx
        movl	176(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_44: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_44: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_44: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_44: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_44: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_45: 0 - 0
        movl	180+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_45: 1 - 2
        movl	196(%rsp), %eax
        addl	%ecx, %edx
        movl	180(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_45: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_45: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_45: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_45: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_45: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_46: 0 - 0
        movl	184+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_46: 1 - 1
        movl	200(%rsp), %eax
        addl	%ecx, %edx
        movl	184(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_46: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_46: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_46: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_46: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_46: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_47: 0 - 0
        movl	188+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_47: 1 - 1
        movl	204(%rsp), %eax
        addl	%ecx, %edx
        movl	188(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_47: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_47: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_47: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_47: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_47: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_47: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 44-47
        # x2_to_w: 56
        vmovdqu	%xmm2, 224(%rsp)
        vmovdqu	%xmm3, 240(%rsp)
        # msg_sched: 48-51
        # iter_48: 0 - 0
        movl	192+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_48: 1 - 1
        movl	208(%rsp), %eax
        addl	%ecx, %edx
        movl	192(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_48: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_48: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_48: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_48: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_48: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_49: 0 - 0
        movl	196+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_49: 1 - 2
        movl	212(%rsp), %eax
        addl	%ecx, %edx
        movl	196(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_49: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_49: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_49: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_49: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_49: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_50: 0 - 0
        movl	200+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_50: 1 - 1
        movl	216(%rsp), %eax
        addl	%ecx, %edx
        movl	200(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_50: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_50: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_50: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_50: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_50: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_51: 0 - 0
        movl	204+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_51: 1 - 1
        movl	220(%rsp), %eax
        addl	%ecx, %edx
        movl	204(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_51: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_51: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_51: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_51: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_51: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_51: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 48-51
        # iter_52: 0 - 7
        movl	208+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        movl	224(%rsp), %eax
        addl	%ecx, %edx
        movl	208(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        # iter_53: 0 - 7
        movl	212+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        movl	228(%rsp), %eax
        addl	%ecx, %edx
        movl	212(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_54: 0 - 7
        movl	216+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        movl	232(%rsp), %eax
        addl	%ecx, %edx
        movl	216(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        # iter_55: 0 - 7
        movl	220+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        movl	236(%rsp), %eax
        addl	%ecx, %edx
        movl	220(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # x0_to_w: 64
        vmovdqu	%xmm0, 256(%rsp)
        # iter_56: 0 - 7
        movl	224+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        movl	240(%rsp), %eax
        addl	%ecx, %edx
        movl	224(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        # iter_57: 0 - 7
        movl	228+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        movl	244(%rsp), %eax
        addl	%ecx, %edx
        movl	228(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_58: 0 - 7
        movl	232+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        movl	248(%rsp), %eax
        addl	%ecx, %edx
        movl	232(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        # iter_59: 0 - 7
        movl	236+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        movl	252(%rsp), %eax
        addl	%ecx, %edx
        movl	236(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # iter_60: 0 - 7
        movl	240+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        movl	256(%rsp), %eax
        addl	%ecx, %edx
        movl	240(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        # iter_61: 0 - 7
        movl	244+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        movl	260(%rsp), %eax
        addl	%ecx, %edx
        movl	244(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_62: 0 - 7
        movl	248+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        movl	264(%rsp), %eax
        addl	%ecx, %edx
        movl	248(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        # iter_63: 0 - 7
        movl	252+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        movl	268(%rsp), %eax
        addl	%ecx, %edx
        movl	252(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        xorl	%r8d, (%rdi)
        xorl	%r9d, 4(%rdi)
        xorl	%r10d, 8(%rdi)
        xorl	%r11d, 12(%rdi)
        xorl	%r12d, 16(%rdi)
        xorl	%r13d, 20(%rdi)
        xorl	%r14d, 24(%rdi)
        xorl	%r15d, 28(%rdi)
        xorq	%rax, %rax
        vzeroupper
        addq	$0x110, %rsp
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbx
        repz retq
#ifndef __APPLE__
.size	sm3_compress_avx1,.-sm3_compress_avx1
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl	sm3_compress_len_avx1
.type	sm3_compress_len_avx1,@function
.align	16
sm3_compress_len_avx1:
#else
.section	__TEXT,__text
.globl	_sm3_compress_len_avx1
.p2align	4
_sm3_compress_len_avx1:
#endif /* __APPLE__ */
        pushq	%rbx
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbp
        movq	%rsi, %rbp
        movq	%rdx, %rsi
        subq	$0x110, %rsp
        vmovdqa	L_SM3_AVX1_flip_mask(%rip), %xmm11
        movl	(%rdi), %r8d
        movl	4(%rdi), %r9d
        movl	8(%rdi), %r10d
        movl	12(%rdi), %r11d
        movl	16(%rdi), %r12d
        movl	20(%rdi), %r13d
        movl	24(%rdi), %r14d
        movl	28(%rdi), %r15d
        # Start of loop processing a block
L_SM3_AVX1len_start:
        # X0, X1, X2, X3 = W[0..15]
        vmovdqu	(%rbp), %xmm0
        vmovdqu	16(%rbp), %xmm1
        vpshufb	%xmm11, %xmm0, %xmm0
        vpshufb	%xmm11, %xmm1, %xmm1
        vmovdqu	32(%rbp), %xmm2
        vmovdqu	48(%rbp), %xmm3
        vpshufb	%xmm11, %xmm2, %xmm2
        vpshufb	%xmm11, %xmm3, %xmm3
        # x_to_w: 0
        vmovdqu	%xmm0, (%rsp)
        vmovdqu	%xmm1, 16(%rsp)
        vmovdqu	%xmm2, 32(%rsp)
        vmovdqu	%xmm3, 48(%rsp)
        # msg_sched: 0-3
        # iter_0: 0 - 0
        movl	0+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_0: 1 - 1
        movl	16(%rsp), %eax
        addl	%ecx, %edx
        movl	(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_0: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_0: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_0: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_0: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_0: 6 - 7
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_1: 0 - 0
        movl	4+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_1: 1 - 2
        movl	20(%rsp), %eax
        addl	%ecx, %edx
        movl	4(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_1: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_1: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_1: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_1: 6 - 6
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_1: 7 - 7
        xorl	%ebx, %r10d
        # iter_2: 0 - 0
        movl	8+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_2: 1 - 1
        movl	24(%rsp), %eax
        addl	%ecx, %edx
        movl	8(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_2: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_2: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_2: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_2: 6 - 6
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_2: 7 - 7
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_3: 0 - 0
        movl	12+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_3: 1 - 1
        movl	28(%rsp), %eax
        addl	%ecx, %edx
        movl	12(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_3: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_3: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_3: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_3: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_3: 6 - 6
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        # iter_3: 7 - 7
        xorl	%ebx, %r8d
        # msg_sched done: 0-3
        # msg_sched: 4-7
        # iter_4: 0 - 0
        movl	16+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_4: 1 - 1
        movl	32(%rsp), %eax
        addl	%ecx, %edx
        movl	16(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_4: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_4: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_4: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_4: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_4: 6 - 7
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_5: 0 - 0
        movl	20+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_5: 1 - 2
        movl	36(%rsp), %eax
        addl	%ecx, %edx
        movl	20(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_5: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_5: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_5: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_5: 6 - 6
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_5: 7 - 7
        xorl	%ebx, %r14d
        # iter_6: 0 - 0
        movl	24+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_6: 1 - 1
        movl	40(%rsp), %eax
        addl	%ecx, %edx
        movl	24(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_6: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_6: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_6: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_6: 6 - 6
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_6: 7 - 7
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_7: 0 - 0
        movl	28+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_7: 1 - 1
        movl	44(%rsp), %eax
        addl	%ecx, %edx
        movl	28(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_7: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_7: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_7: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_7: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_7: 6 - 6
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        # iter_7: 7 - 7
        xorl	%ebx, %r12d
        # msg_sched done: 4-7
        # x2_to_w: 16
        vmovdqu	%xmm0, 64(%rsp)
        vmovdqu	%xmm1, 80(%rsp)
        # msg_sched: 8-11
        # iter_8: 0 - 0
        movl	32+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_8: 1 - 1
        movl	48(%rsp), %eax
        addl	%ecx, %edx
        movl	32(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_8: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_8: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_8: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_8: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_8: 6 - 7
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_9: 0 - 0
        movl	36+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_9: 1 - 2
        movl	52(%rsp), %eax
        addl	%ecx, %edx
        movl	36(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_9: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_9: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_9: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_9: 6 - 6
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_9: 7 - 7
        xorl	%ebx, %r10d
        # iter_10: 0 - 0
        movl	40+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_10: 1 - 1
        movl	56(%rsp), %eax
        addl	%ecx, %edx
        movl	40(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_10: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_10: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_10: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_10: 6 - 6
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_10: 7 - 7
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_11: 0 - 0
        movl	44+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_11: 1 - 1
        movl	60(%rsp), %eax
        addl	%ecx, %edx
        movl	44(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_11: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_11: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_11: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_11: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_11: 6 - 6
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        # iter_11: 7 - 7
        xorl	%ebx, %r8d
        # msg_sched done: 8-11
        # msg_sched: 12-15
        # iter_12: 0 - 0
        movl	48+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_12: 1 - 1
        movl	64(%rsp), %eax
        addl	%ecx, %edx
        movl	48(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_12: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_12: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_12: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_12: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_12: 6 - 7
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_13: 0 - 0
        movl	52+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_13: 1 - 2
        movl	68(%rsp), %eax
        addl	%ecx, %edx
        movl	52(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_13: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_13: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_13: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_13: 6 - 6
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_13: 7 - 7
        xorl	%ebx, %r14d
        # iter_14: 0 - 0
        movl	56+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_14: 1 - 1
        movl	72(%rsp), %eax
        addl	%ecx, %edx
        movl	56(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_14: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_14: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_14: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_14: 6 - 6
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_14: 7 - 7
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_15: 0 - 0
        movl	60+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_15: 1 - 1
        movl	76(%rsp), %eax
        addl	%ecx, %edx
        movl	60(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_15: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_15: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_15: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_15: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_15: 6 - 6
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        # iter_15: 7 - 7
        xorl	%ebx, %r12d
        # msg_sched done: 12-15
        # x2_to_w: 24
        vmovdqu	%xmm2, 96(%rsp)
        vmovdqu	%xmm3, 112(%rsp)
        # msg_sched: 16-19
        # iter_16: 0 - 0
        movl	64+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_16: 1 - 1
        movl	80(%rsp), %eax
        addl	%ecx, %edx
        movl	64(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_16: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_16: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_16: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_16: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_16: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_17: 0 - 0
        movl	68+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_17: 1 - 2
        movl	84(%rsp), %eax
        addl	%ecx, %edx
        movl	68(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_17: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_17: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_17: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_17: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_17: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_18: 0 - 0
        movl	72+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_18: 1 - 1
        movl	88(%rsp), %eax
        addl	%ecx, %edx
        movl	72(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_18: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_18: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_18: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_18: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_18: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_19: 0 - 0
        movl	76+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_19: 1 - 1
        movl	92(%rsp), %eax
        addl	%ecx, %edx
        movl	76(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_19: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_19: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_19: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_19: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_19: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_19: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 16-19
        # msg_sched: 20-23
        # iter_20: 0 - 0
        movl	80+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_20: 1 - 1
        movl	96(%rsp), %eax
        addl	%ecx, %edx
        movl	80(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_20: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_20: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_20: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_20: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_20: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_21: 0 - 0
        movl	84+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_21: 1 - 2
        movl	100(%rsp), %eax
        addl	%ecx, %edx
        movl	84(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_21: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_21: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_21: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_21: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_21: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_22: 0 - 0
        movl	88+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_22: 1 - 1
        movl	104(%rsp), %eax
        addl	%ecx, %edx
        movl	88(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_22: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_22: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_22: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_22: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_22: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_23: 0 - 0
        movl	92+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_23: 1 - 1
        movl	108(%rsp), %eax
        addl	%ecx, %edx
        movl	92(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_23: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_23: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_23: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_23: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_23: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_23: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 20-23
        # x2_to_w: 32
        vmovdqu	%xmm0, 128(%rsp)
        vmovdqu	%xmm1, 144(%rsp)
        # msg_sched: 24-27
        # iter_24: 0 - 0
        movl	96+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_24: 1 - 1
        movl	112(%rsp), %eax
        addl	%ecx, %edx
        movl	96(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_24: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_24: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_24: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_24: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_24: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_25: 0 - 0
        movl	100+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_25: 1 - 2
        movl	116(%rsp), %eax
        addl	%ecx, %edx
        movl	100(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_25: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_25: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_25: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_25: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_25: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_26: 0 - 0
        movl	104+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_26: 1 - 1
        movl	120(%rsp), %eax
        addl	%ecx, %edx
        movl	104(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_26: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_26: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_26: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_26: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_26: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_27: 0 - 0
        movl	108+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_27: 1 - 1
        movl	124(%rsp), %eax
        addl	%ecx, %edx
        movl	108(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_27: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_27: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_27: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_27: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_27: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_27: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 24-27
        # msg_sched: 28-31
        # iter_28: 0 - 0
        movl	112+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_28: 1 - 1
        movl	128(%rsp), %eax
        addl	%ecx, %edx
        movl	112(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_28: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_28: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_28: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_28: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_28: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_29: 0 - 0
        movl	116+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_29: 1 - 2
        movl	132(%rsp), %eax
        addl	%ecx, %edx
        movl	116(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_29: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_29: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_29: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_29: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_29: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_30: 0 - 0
        movl	120+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_30: 1 - 1
        movl	136(%rsp), %eax
        addl	%ecx, %edx
        movl	120(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_30: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_30: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_30: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_30: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_30: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_31: 0 - 0
        movl	124+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_31: 1 - 1
        movl	140(%rsp), %eax
        addl	%ecx, %edx
        movl	124(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_31: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_31: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_31: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_31: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_31: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_31: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 28-31
        # x2_to_w: 40
        vmovdqu	%xmm2, 160(%rsp)
        vmovdqu	%xmm3, 176(%rsp)
        # msg_sched: 32-35
        # iter_32: 0 - 0
        movl	128+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_32: 1 - 1
        movl	144(%rsp), %eax
        addl	%ecx, %edx
        movl	128(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_32: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_32: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_32: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_32: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_32: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_33: 0 - 0
        movl	132+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_33: 1 - 2
        movl	148(%rsp), %eax
        addl	%ecx, %edx
        movl	132(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_33: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_33: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_33: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_33: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_33: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_34: 0 - 0
        movl	136+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_34: 1 - 1
        movl	152(%rsp), %eax
        addl	%ecx, %edx
        movl	136(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_34: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_34: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_34: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_34: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_34: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_35: 0 - 0
        movl	140+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_35: 1 - 1
        movl	156(%rsp), %eax
        addl	%ecx, %edx
        movl	140(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_35: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_35: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_35: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_35: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_35: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_35: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 32-35
        # msg_sched: 36-39
        # iter_36: 0 - 0
        movl	144+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_36: 1 - 1
        movl	160(%rsp), %eax
        addl	%ecx, %edx
        movl	144(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_36: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_36: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_36: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_36: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_36: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_37: 0 - 0
        movl	148+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_37: 1 - 2
        movl	164(%rsp), %eax
        addl	%ecx, %edx
        movl	148(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_37: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_37: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_37: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_37: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_37: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_38: 0 - 0
        movl	152+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_38: 1 - 1
        movl	168(%rsp), %eax
        addl	%ecx, %edx
        movl	152(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_38: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_38: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_38: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_38: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_38: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_39: 0 - 0
        movl	156+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_39: 1 - 1
        movl	172(%rsp), %eax
        addl	%ecx, %edx
        movl	156(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_39: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_39: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_39: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_39: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_39: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_39: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 36-39
        # x2_to_w: 48
        vmovdqu	%xmm0, 192(%rsp)
        vmovdqu	%xmm1, 208(%rsp)
        # msg_sched: 40-43
        # iter_40: 0 - 0
        movl	160+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_40: 1 - 1
        movl	176(%rsp), %eax
        addl	%ecx, %edx
        movl	160(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_40: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_40: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_40: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_40: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_40: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_41: 0 - 0
        movl	164+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_41: 1 - 2
        movl	180(%rsp), %eax
        addl	%ecx, %edx
        movl	164(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_41: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_41: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_41: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_41: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_41: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_42: 0 - 0
        movl	168+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_42: 1 - 1
        movl	184(%rsp), %eax
        addl	%ecx, %edx
        movl	168(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_42: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_42: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_42: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_42: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_42: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_43: 0 - 0
        movl	172+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_43: 1 - 1
        movl	188(%rsp), %eax
        addl	%ecx, %edx
        movl	172(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_43: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_43: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_43: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_43: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_43: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_43: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 40-43
        # msg_sched: 44-47
        # iter_44: 0 - 0
        movl	176+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_44: 1 - 1
        movl	192(%rsp), %eax
        addl	%ecx, %edx
        movl	176(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_44: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_44: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_44: 4 - 4
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_44: 5 - 5
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_44: 6 - 7
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_45: 0 - 0
        movl	180+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_45: 1 - 2
        movl	196(%rsp), %eax
        addl	%ecx, %edx
        movl	180(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_45: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_45: 4 - 4
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_45: 5 - 5
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_45: 6 - 6
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_45: 7 - 7
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_46: 0 - 0
        movl	184+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_46: 1 - 1
        movl	200(%rsp), %eax
        addl	%ecx, %edx
        movl	184(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_46: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_46: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_46: 5 - 5
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_46: 6 - 6
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_46: 7 - 7
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_47: 0 - 0
        movl	188+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_47: 1 - 1
        movl	204(%rsp), %eax
        addl	%ecx, %edx
        movl	188(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_47: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_47: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_47: 4 - 4
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_47: 5 - 5
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_47: 6 - 6
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        # iter_47: 7 - 7
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # msg_sched done: 44-47
        # x2_to_w: 56
        vmovdqu	%xmm2, 224(%rsp)
        vmovdqu	%xmm3, 240(%rsp)
        # msg_sched: 48-51
        # iter_48: 0 - 0
        movl	192+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_48: 1 - 1
        movl	208(%rsp), %eax
        addl	%ecx, %edx
        movl	192(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_48: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_48: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_48: 4 - 4
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_48: 5 - 5
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_48: 6 - 7
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_49: 0 - 0
        movl	196+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_49: 1 - 2
        movl	212(%rsp), %eax
        addl	%ecx, %edx
        movl	196(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_49: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_49: 4 - 4
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_49: 5 - 5
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_49: 6 - 6
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_49: 7 - 7
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_50: 0 - 0
        movl	200+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_50: 1 - 1
        movl	216(%rsp), %eax
        addl	%ecx, %edx
        movl	200(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_50: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_50: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_50: 5 - 5
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_50: 6 - 6
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_50: 7 - 7
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_51: 0 - 0
        movl	204+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_51: 1 - 1
        movl	220(%rsp), %eax
        addl	%ecx, %edx
        movl	204(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_51: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_51: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_51: 4 - 4
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_51: 5 - 5
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_51: 6 - 6
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        # iter_51: 7 - 7
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # msg_sched done: 48-51
        # iter_52: 0 - 7
        movl	208+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        movl	224(%rsp), %eax
        addl	%ecx, %edx
        movl	208(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        # iter_53: 0 - 7
        movl	212+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        movl	228(%rsp), %eax
        addl	%ecx, %edx
        movl	212(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_54: 0 - 7
        movl	216+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        movl	232(%rsp), %eax
        addl	%ecx, %edx
        movl	216(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        # iter_55: 0 - 7
        movl	220+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        movl	236(%rsp), %eax
        addl	%ecx, %edx
        movl	220(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        # x0_to_w: 64
        vmovdqu	%xmm0, 256(%rsp)
        # iter_56: 0 - 7
        movl	224+L_SM3_AVX1_t(%rip), %edx
        movl	%r8d, %ecx
        addl	%r12d, %edx
        roll	$12, %ecx
        movl	240(%rsp), %eax
        addl	%ecx, %edx
        movl	224(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r13d, %r11d
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        xorl	%r14d, %r11d
        andl	%ebx, %r15d
        andl	%r12d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        addl	%ecx, %r15d
        addl	%edx, %r11d
        movl	%r11d, %ebx
        roll	$8, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r11d
        # iter_57: 0 - 7
        movl	228+L_SM3_AVX1_t(%rip), %edx
        movl	%r15d, %ecx
        addl	%r11d, %edx
        roll	$12, %ecx
        movl	244(%rsp), %eax
        addl	%ecx, %edx
        movl	228(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r12d, %r10d
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        xorl	%r13d, %r10d
        andl	%ebx, %r14d
        andl	%r11d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        addl	%ecx, %r14d
        addl	%edx, %r10d
        movl	%r10d, %ebx
        roll	$8, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r10d
        # iter_58: 0 - 7
        movl	232+L_SM3_AVX1_t(%rip), %edx
        movl	%r14d, %ecx
        addl	%r10d, %edx
        roll	$12, %ecx
        movl	248(%rsp), %eax
        addl	%ecx, %edx
        movl	232(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r11d, %r9d
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        xorl	%r12d, %r9d
        andl	%ebx, %r13d
        andl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        addl	%ecx, %r13d
        addl	%edx, %r9d
        movl	%r9d, %ebx
        roll	$8, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        roll	$9, %r9d
        xorl	%ebx, %r9d
        # iter_59: 0 - 7
        movl	236+L_SM3_AVX1_t(%rip), %edx
        movl	%r13d, %ecx
        addl	%r9d, %edx
        roll	$12, %ecx
        movl	252(%rsp), %eax
        addl	%ecx, %edx
        movl	236(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r10d, %r8d
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        xorl	%r11d, %r8d
        andl	%ebx, %r12d
        andl	%r9d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        addl	%ecx, %r12d
        addl	%edx, %r8d
        movl	%r8d, %ebx
        roll	$8, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        roll	$9, %r8d
        xorl	%ebx, %r8d
        # iter_60: 0 - 7
        movl	240+L_SM3_AVX1_t(%rip), %edx
        movl	%r12d, %ecx
        addl	%r8d, %edx
        roll	$12, %ecx
        movl	256(%rsp), %eax
        addl	%ecx, %edx
        movl	240(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r9d, %r15d
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        xorl	%r10d, %r15d
        andl	%ebx, %r11d
        andl	%r8d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        addl	%ecx, %r11d
        addl	%edx, %r15d
        movl	%r15d, %ebx
        roll	$8, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        roll	$9, %r15d
        xorl	%ebx, %r15d
        # iter_61: 0 - 7
        movl	244+L_SM3_AVX1_t(%rip), %edx
        movl	%r11d, %ecx
        addl	%r15d, %edx
        roll	$12, %ecx
        movl	260(%rsp), %eax
        addl	%ecx, %edx
        movl	244(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r8d, %r14d
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        xorl	%r9d, %r14d
        andl	%ebx, %r10d
        andl	%r15d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        addl	%ecx, %r10d
        addl	%edx, %r14d
        movl	%r14d, %ebx
        roll	$8, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        roll	$9, %r14d
        xorl	%ebx, %r14d
        # iter_62: 0 - 7
        movl	248+L_SM3_AVX1_t(%rip), %edx
        movl	%r10d, %ecx
        addl	%r14d, %edx
        roll	$12, %ecx
        movl	264(%rsp), %eax
        addl	%ecx, %edx
        movl	248(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r15d, %r13d
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        xorl	%r8d, %r13d
        andl	%ebx, %r9d
        andl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        addl	%ecx, %r9d
        addl	%edx, %r13d
        movl	%r13d, %ebx
        roll	$8, %r13d
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        roll	$9, %r13d
        xorl	%ebx, %r13d
        # iter_63: 0 - 7
        movl	252+L_SM3_AVX1_t(%rip), %edx
        movl	%r9d, %ecx
        addl	%r13d, %edx
        roll	$12, %ecx
        movl	268(%rsp), %eax
        addl	%ecx, %edx
        movl	252(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r14d, %r12d
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        xorl	%r15d, %r12d
        andl	%ebx, %r8d
        andl	%r13d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        addl	%ecx, %r8d
        addl	%edx, %r12d
        movl	%r12d, %ebx
        roll	$8, %r12d
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        roll	$9, %r12d
        xorl	%ebx, %r12d
        xorl	(%rdi), %r8d
        xorl	4(%rdi), %r9d
        xorl	8(%rdi), %r10d
        xorl	12(%rdi), %r11d
        xorl	16(%rdi), %r12d
        xorl	20(%rdi), %r13d
        xorl	24(%rdi), %r14d
        xorl	28(%rdi), %r15d
        addq	$0x40, %rbp
        subl	$0x40, %esi
        movl	%r8d, (%rdi)
        movl	%r9d, 4(%rdi)
        movl	%r10d, 8(%rdi)
        movl	%r11d, 12(%rdi)
        movl	%r12d, 16(%rdi)
        movl	%r13d, 20(%rdi)
        movl	%r14d, 24(%rdi)
        movl	%r15d, 28(%rdi)
        jnz	L_SM3_AVX1len_start
        xorq	%rax, %rax
        vzeroupper
        addq	$0x110, %rsp
        popq	%rbp
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbx
        repz retq
#ifndef __APPLE__
.size	sm3_compress_len_avx1,.-sm3_compress_len_avx1
#endif /* __APPLE__ */
#ifndef __APPLE__
.data
#else
.section	__DATA,__data
#endif /* __APPLE__ */
/* Pre-rotated SM3 round constants for the AVX1+RORX compress routine.
 * SM3 defines T_j = 0x79cc4519 for rounds 0..15 and T_j = 0x7a879d8a for
 * rounds 16..63; each round adds (T_j <<< (j mod 32)) into the mix.
 * This table stores that rotation pre-computed, so entry j is
 * T_j rotated left by (j mod 32) and the round code does a plain
 * 32-bit load: movl (4*j)+L_SM3_AVX1_RORX_t(%rip), %edx.
 * Rows 12..15 (indices 48..63) repeat rows 4..7 (indices 16..31)
 * because the rotate count wraps modulo 32.
 * NOTE(review): read-only data placed in .data rather than
 * .rodata/__const — matches the convention of the other tables in this
 * generated file, but could live in a read-only section. */
L_SM3_AVX1_RORX_t:
.long	0x79cc4519,0xf3988a32,0xe7311465,0xce6228cb
.long	0x9cc45197,0x3988a32f,0x7311465e,0xe6228cbc
.long	0xcc451979,0x988a32f3,0x311465e7,0x6228cbce
.long	0xc451979c,0x88a32f39,0x11465e73,0x228cbce6
.long	0x9d8a7a87,0x3b14f50f,0x7629ea1e,0xec53d43c
.long	0xd8a7a879,0xb14f50f3,0x629ea1e7,0xc53d43ce
.long	0x8a7a879d,0x14f50f3b,0x29ea1e76,0x53d43cec
.long	0xa7a879d8,0x4f50f3b1,0x9ea1e762,0x3d43cec5
.long	0x7a879d8a,0xf50f3b14,0xea1e7629,0xd43cec53
.long	0xa879d8a7,0x50f3b14f,0xa1e7629e,0x43cec53d
.long	0x879d8a7a,0xf3b14f5,0x1e7629ea,0x3cec53d4
.long	0x79d8a7a8,0xf3b14f50,0xe7629ea1,0xcec53d43
.long	0x9d8a7a87,0x3b14f50f,0x7629ea1e,0xec53d43c
.long	0xd8a7a879,0xb14f50f3,0x629ea1e7,0xc53d43ce
.long	0x8a7a879d,0x14f50f3b,0x29ea1e76,0x53d43cec
.long	0xa7a879d8,0x4f50f3b1,0x9ea1e762,0x3d43cec5
#ifndef __APPLE__
.data
#else
.section	__DATA,__data
#endif /* __APPLE__ */
#ifndef __APPLE__
.align	16
#else
.p2align	4
#endif /* __APPLE__ */
/* vpshufb control mask used when loading the 64-byte message block:
 * little-endian byte order of the two quadwords is
 *   03 02 01 00 07 06 05 04 | 0b 0a 09 08 0f 0e 0d 0c,
 * i.e. reverse the four bytes within each 32-bit lane, converting the
 * big-endian message words of SM3 to host (little-endian) order.
 * 16-byte aligned so it can be used as an aligned XMM operand. */
L_SM3_AVX1_RORX_flip_mask:
.quad	0x405060700010203, 0xc0d0e0f08090a0b
#ifndef __APPLE__
.text
.globl	sm3_compress_avx1_rorx
.type	sm3_compress_avx1_rorx,@function
.align	16
sm3_compress_avx1_rorx:
#else
.section	__TEXT,__text
.globl	_sm3_compress_avx1_rorx
.p2align	4
_sm3_compress_avx1_rorx:
#endif /* __APPLE__ */
        pushq	%rbx
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        subq	$0x110, %rsp
        leaq	32(%rdi), %rax
        vmovdqa	L_SM3_AVX1_RORX_flip_mask(%rip), %xmm11
        movl	(%rdi), %r8d
        movl	4(%rdi), %r9d
        movl	8(%rdi), %r10d
        movl	12(%rdi), %r11d
        movl	16(%rdi), %r12d
        movl	20(%rdi), %r13d
        movl	24(%rdi), %r14d
        movl	28(%rdi), %r15d
        # X0, X1, X2, X3 = W[0..15]
        vmovdqu	(%rax), %xmm0
        vmovdqu	16(%rax), %xmm1
        vmovdqu	32(%rax), %xmm2
        vmovdqu	48(%rax), %xmm3
        # x_to_w: 0
        vmovdqu	%xmm0, (%rsp)
        vmovdqu	%xmm1, 16(%rsp)
        vmovdqu	%xmm2, 32(%rsp)
        vmovdqu	%xmm3, 48(%rsp)
        # msg_sched: 0-3
        # iter_0: 0 - 0
        movl	0+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_0: 1 - 1
        movl	16(%rsp), %eax
        addl	%ecx, %edx
        movl	(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_0: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_0: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_0: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_0: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_0: 6 - 7
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_1: 0 - 0
        movl	4+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_1: 1 - 2
        movl	20(%rsp), %eax
        addl	%ecx, %edx
        movl	4(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_1: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_1: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_1: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_1: 6 - 6
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        xorl	%ebx, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_1: 7 - 7
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_2: 0 - 0
        movl	8+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_2: 1 - 1
        movl	24(%rsp), %eax
        addl	%ecx, %edx
        movl	8(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_2: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_2: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_2: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_2: 6 - 6
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        xorl	%ebx, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_2: 7 - 7
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_3: 0 - 0
        movl	12+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_3: 1 - 1
        movl	28(%rsp), %eax
        addl	%ecx, %edx
        movl	12(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_3: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_3: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_3: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_3: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_3: 6 - 6
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        xorl	%ebx, %r8d
        # iter_3: 7 - 7
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 0-3
        # msg_sched: 4-7
        # iter_4: 0 - 0
        movl	16+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_4: 1 - 1
        movl	32(%rsp), %eax
        addl	%ecx, %edx
        movl	16(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_4: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_4: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_4: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_4: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_4: 6 - 7
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_5: 0 - 0
        movl	20+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_5: 1 - 2
        movl	36(%rsp), %eax
        addl	%ecx, %edx
        movl	20(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_5: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_5: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_5: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_5: 6 - 6
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_5: 7 - 7
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_6: 0 - 0
        movl	24+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_6: 1 - 1
        movl	40(%rsp), %eax
        addl	%ecx, %edx
        movl	24(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_6: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_6: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_6: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_6: 6 - 6
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_6: 7 - 7
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_7: 0 - 0
        movl	28+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_7: 1 - 1
        movl	44(%rsp), %eax
        addl	%ecx, %edx
        movl	28(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_7: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_7: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_7: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_7: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_7: 6 - 6
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        # iter_7: 7 - 7
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 4-7
        # x2_to_w: 16
        vmovdqu	%xmm0, 64(%rsp)
        vmovdqu	%xmm1, 80(%rsp)
        # msg_sched: 8-11
        # iter_8: 0 - 0
        movl	32+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_8: 1 - 1
        movl	48(%rsp), %eax
        addl	%ecx, %edx
        movl	32(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_8: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_8: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_8: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_8: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_8: 6 - 7
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_9: 0 - 0
        movl	36+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_9: 1 - 2
        movl	52(%rsp), %eax
        addl	%ecx, %edx
        movl	36(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_9: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_9: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_9: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_9: 6 - 6
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        xorl	%ebx, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_9: 7 - 7
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_10: 0 - 0
        movl	40+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_10: 1 - 1
        movl	56(%rsp), %eax
        addl	%ecx, %edx
        movl	40(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_10: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_10: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_10: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_10: 6 - 6
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        xorl	%ebx, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_10: 7 - 7
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_11: 0 - 0
        movl	44+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_11: 1 - 1
        movl	60(%rsp), %eax
        addl	%ecx, %edx
        movl	44(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_11: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_11: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_11: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_11: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_11: 6 - 6
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        xorl	%ebx, %r8d
        # iter_11: 7 - 7
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 8-11
        # msg_sched: 12-15
        # iter_12: 0 - 0
        movl	48+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_12: 1 - 1
        movl	64(%rsp), %eax
        addl	%ecx, %edx
        movl	48(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_12: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_12: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_12: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_12: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_12: 6 - 7
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_13: 0 - 0
        movl	52+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_13: 1 - 2
        movl	68(%rsp), %eax
        addl	%ecx, %edx
        movl	52(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_13: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_13: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_13: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_13: 6 - 6
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_13: 7 - 7
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_14: 0 - 0
        movl	56+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_14: 1 - 1
        movl	72(%rsp), %eax
        addl	%ecx, %edx
        movl	56(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_14: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_14: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_14: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_14: 6 - 6
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_14: 7 - 7
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_15: 0 - 0
        movl	60+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_15: 1 - 1
        movl	76(%rsp), %eax
        addl	%ecx, %edx
        movl	60(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_15: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_15: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_15: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_15: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_15: 6 - 6
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        # iter_15: 7 - 7
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 12-15
        # x2_to_w: 24
        vmovdqu	%xmm2, 96(%rsp)
        vmovdqu	%xmm3, 112(%rsp)
        # msg_sched: 16-19
        # iter_16: 0 - 0
        movl	64+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	80(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_16: 1 - 1
        addl	%ecx, %edx
        movl	64(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_16: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_16: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_16: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_16: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_16: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_17: 0 - 0
        movl	68+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	84(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_17: 1 - 2
        addl	%ecx, %edx
        movl	68(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_17: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_17: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_17: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_17: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_17: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_18: 0 - 0
        movl	72+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	88(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_18: 1 - 1
        addl	%ecx, %edx
        movl	72(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_18: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_18: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_18: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_18: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_18: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_19: 0 - 0
        movl	76+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	92(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_19: 1 - 1
        addl	%ecx, %edx
        movl	76(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_19: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_19: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_19: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_19: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_19: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_19: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 16-19
        # msg_sched: 20-23
        # iter_20: 0 - 0
        movl	80+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	96(%rsp), %eax
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_20: 1 - 1
        addl	%ecx, %edx
        movl	80(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_20: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_20: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_20: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_20: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_20: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_21: 0 - 0
        movl	84+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	100(%rsp), %eax
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_21: 1 - 2
        addl	%ecx, %edx
        movl	84(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_21: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_21: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_21: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_21: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_21: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_22: 0 - 0
        movl	88+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	104(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_22: 1 - 1
        addl	%ecx, %edx
        movl	88(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_22: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_22: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_22: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_22: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_22: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_23: 0 - 0
        movl	92+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	108(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_23: 1 - 1
        addl	%ecx, %edx
        movl	92(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_23: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_23: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_23: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_23: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_23: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_23: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 20-23
        # x2_to_w: 32
        vmovdqu	%xmm0, 128(%rsp)
        vmovdqu	%xmm1, 144(%rsp)
        # msg_sched: 24-27
        # iter_24: 0 - 0
        movl	96+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	112(%rsp), %eax
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_24: 1 - 1
        addl	%ecx, %edx
        movl	96(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_24: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_24: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_24: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_24: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_24: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_25: 0 - 0
        movl	100+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	116(%rsp), %eax
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_25: 1 - 2
        addl	%ecx, %edx
        movl	100(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_25: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_25: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_25: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_25: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_25: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_26: 0 - 0
        movl	104+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	120(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_26: 1 - 1
        addl	%ecx, %edx
        movl	104(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_26: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_26: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_26: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_26: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_26: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_27: 0 - 0
        movl	108+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	124(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_27: 1 - 1
        addl	%ecx, %edx
        movl	108(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_27: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_27: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_27: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_27: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_27: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_27: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 24-27
        # msg_sched: 28-31
        # iter_28: 0 - 0
        movl	112+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	128(%rsp), %eax
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_28: 1 - 1
        addl	%ecx, %edx
        movl	112(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_28: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_28: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_28: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_28: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_28: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_29: 0 - 0
        movl	116+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	132(%rsp), %eax
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_29: 1 - 2
        addl	%ecx, %edx
        movl	116(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_29: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_29: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_29: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_29: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_29: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_30: 0 - 0
        movl	120+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	136(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_30: 1 - 1
        addl	%ecx, %edx
        movl	120(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_30: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_30: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_30: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_30: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_30: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_31: 0 - 0
        movl	124+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	140(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_31: 1 - 1
        addl	%ecx, %edx
        movl	124(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_31: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_31: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_31: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_31: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_31: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_31: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 28-31
        # x2_to_w: 40
        vmovdqu	%xmm2, 160(%rsp)
        vmovdqu	%xmm3, 176(%rsp)
        # msg_sched: 32-35
        # iter_32: 0 - 0
        movl	128+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	144(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_32: 1 - 1
        addl	%ecx, %edx
        movl	128(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_32: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_32: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_32: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_32: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_32: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_33: 0 - 0
        movl	132+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	148(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_33: 1 - 2
        addl	%ecx, %edx
        movl	132(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_33: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_33: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_33: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_33: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_33: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_34: 0 - 0
        movl	136+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	152(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_34: 1 - 1
        addl	%ecx, %edx
        movl	136(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_34: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_34: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_34: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_34: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_34: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_35: 0 - 0
        movl	140+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	156(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_35: 1 - 1
        addl	%ecx, %edx
        movl	140(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_35: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_35: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_35: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_35: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_35: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_35: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 32-35
        # msg_sched: 36-39
        # iter_36: 0 - 0
        movl	144+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	160(%rsp), %eax
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_36: 1 - 1
        addl	%ecx, %edx
        movl	144(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_36: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_36: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_36: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_36: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_36: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_37: 0 - 0
        movl	148+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	164(%rsp), %eax
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_37: 1 - 2
        addl	%ecx, %edx
        movl	148(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_37: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_37: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_37: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_37: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_37: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_38: 0 - 0
        movl	152+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	168(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_38: 1 - 1
        addl	%ecx, %edx
        movl	152(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_38: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_38: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_38: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_38: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_38: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_39: 0 - 0
        movl	156+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	172(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_39: 1 - 1
        addl	%ecx, %edx
        movl	156(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_39: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_39: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_39: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_39: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_39: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_39: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 36-39
        # x2_to_w: 48
        vmovdqu	%xmm0, 192(%rsp)
        vmovdqu	%xmm1, 208(%rsp)
        # msg_sched: 40-43
        # iter_40: 0 - 0
        movl	160+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	176(%rsp), %eax
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_40: 1 - 1
        addl	%ecx, %edx
        movl	160(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_40: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_40: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_40: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_40: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_40: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_41: 0 - 0
        movl	164+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	180(%rsp), %eax
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_41: 1 - 2
        addl	%ecx, %edx
        movl	164(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_41: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_41: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_41: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_41: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_41: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_42: 0 - 0
        movl	168+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	184(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_42: 1 - 1
        addl	%ecx, %edx
        movl	168(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_42: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_42: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_42: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_42: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_42: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_43: 0 - 0
        movl	172+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	188(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_43: 1 - 1
        addl	%ecx, %edx
        movl	172(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_43: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_43: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_43: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_43: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_43: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_43: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 40-43
        # msg_sched: 44-47
        # iter_44: 0 - 0
        movl	176+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	192(%rsp), %eax
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_44: 1 - 1
        addl	%ecx, %edx
        movl	176(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_44: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_44: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_44: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_44: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_44: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_45: 0 - 0
        movl	180+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	196(%rsp), %eax
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_45: 1 - 2
        addl	%ecx, %edx
        movl	180(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_45: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_45: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_45: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_45: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_45: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_46: 0 - 0
        movl	184+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	200(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_46: 1 - 1
        addl	%ecx, %edx
        movl	184(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_46: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_46: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_46: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_46: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_46: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_47: 0 - 0
        movl	188+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	204(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_47: 1 - 1
        addl	%ecx, %edx
        movl	188(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_47: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_47: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_47: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_47: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_47: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_47: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 44-47
        # x2_to_w: 56
        vmovdqu	%xmm2, 224(%rsp)
        vmovdqu	%xmm3, 240(%rsp)
        # msg_sched: 48-51
        # iter_48: 0 - 0
        movl	192+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	208(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_48: 1 - 1
        addl	%ecx, %edx
        movl	192(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_48: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_48: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_48: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_48: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_48: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_49: 0 - 0
        movl	196+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	212(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_49: 1 - 2
        addl	%ecx, %edx
        movl	196(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_49: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_49: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_49: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_49: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_49: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_50: 0 - 0
        movl	200+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	216(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_50: 1 - 1
        addl	%ecx, %edx
        movl	200(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_50: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_50: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_50: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_50: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_50: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_51: 0 - 0
        movl	204+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	220(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_51: 1 - 1
        addl	%ecx, %edx
        movl	204(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_51: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_51: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_51: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_51: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_51: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_51: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 48-51
        # iter_52: 0 - 7
        movl	208+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	224(%rsp), %eax
        addl	%ecx, %edx
        movl	208(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        # iter_53: 0 - 7
        movl	212+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	228(%rsp), %eax
        addl	%ecx, %edx
        movl	212(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_54: 0 - 7
        movl	216+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	232(%rsp), %eax
        addl	%ecx, %edx
        movl	216(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        # iter_55: 0 - 7
        movl	220+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	236(%rsp), %eax
        addl	%ecx, %edx
        movl	220(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # x0_to_w: 64
        vmovdqu	%xmm0, 256(%rsp)
        # iter_56: 0 - 7
        movl	224+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	240(%rsp), %eax
        addl	%ecx, %edx
        movl	224(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        # iter_57: 0 - 7
        movl	228+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	244(%rsp), %eax
        addl	%ecx, %edx
        movl	228(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_58: 0 - 7
        movl	232+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	248(%rsp), %eax
        addl	%ecx, %edx
        movl	232(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        # iter_59: 0 - 7
        movl	236+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	252(%rsp), %eax
        addl	%ecx, %edx
        movl	236(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # iter_60: 0 - 7
        movl	240+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	256(%rsp), %eax
        addl	%ecx, %edx
        movl	240(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        # iter_61: 0 - 7
        movl	244+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	260(%rsp), %eax
        addl	%ecx, %edx
        movl	244(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_62: 0 - 7
        movl	248+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	264(%rsp), %eax
        addl	%ecx, %edx
        movl	248(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        # iter_63: 0 - 7
        movl	252+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	268(%rsp), %eax
        addl	%ecx, %edx
        movl	252(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        xorl	%r8d, (%rdi)
        xorl	%r9d, 4(%rdi)
        xorl	%r10d, 8(%rdi)
        xorl	%r11d, 12(%rdi)
        xorl	%r12d, 16(%rdi)
        xorl	%r13d, 20(%rdi)
        xorl	%r14d, 24(%rdi)
        xorl	%r15d, 28(%rdi)
        xorq	%rax, %rax
        vzeroupper
        addq	$0x110, %rsp
        popq	%r15
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbx
        repz retq
#ifndef __APPLE__
.size	sm3_compress_avx1_rorx,.-sm3_compress_avx1_rorx
#endif /* __APPLE__ */
#ifndef __APPLE__
.text
.globl	sm3_compress_len_avx1_rorx
.type	sm3_compress_len_avx1_rorx,@function
.align	16
sm3_compress_len_avx1_rorx:
#else
.section	__TEXT,__text
.globl	_sm3_compress_len_avx1_rorx
.p2align	4
_sm3_compress_len_avx1_rorx:
#endif /* __APPLE__ */
        pushq	%rbx
        pushq	%r12
        pushq	%r13
        pushq	%r14
        pushq	%r15
        pushq	%rbp
        movq	%rsi, %rbp
        movq	%rdx, %rsi
        subq	$0x110, %rsp
        vmovdqa	L_SM3_AVX1_RORX_flip_mask(%rip), %xmm11
        movl	(%rdi), %r8d
        movl	4(%rdi), %r9d
        movl	8(%rdi), %r10d
        movl	12(%rdi), %r11d
        movl	16(%rdi), %r12d
        movl	20(%rdi), %r13d
        movl	24(%rdi), %r14d
        movl	28(%rdi), %r15d
        # Start of loop processing a block
L_SM3_AVX1_RORXlen_start:
        # X0, X1, X2, X3 = W[0..15]
        vmovdqu	(%rbp), %xmm0
        vmovdqu	16(%rbp), %xmm1
        vpshufb	%xmm11, %xmm0, %xmm0
        vpshufb	%xmm11, %xmm1, %xmm1
        vmovdqu	32(%rbp), %xmm2
        vmovdqu	48(%rbp), %xmm3
        vpshufb	%xmm11, %xmm2, %xmm2
        vpshufb	%xmm11, %xmm3, %xmm3
        # x_to_w: 0
        vmovdqu	%xmm0, (%rsp)
        vmovdqu	%xmm1, 16(%rsp)
        vmovdqu	%xmm2, 32(%rsp)
        vmovdqu	%xmm3, 48(%rsp)
        # msg_sched: 0-3
        # iter_0: 0 - 0
        movl	0+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_0: 1 - 1
        movl	16(%rsp), %eax
        addl	%ecx, %edx
        movl	(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_0: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_0: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_0: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_0: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_0: 6 - 7
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_1: 0 - 0
        movl	4+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_1: 1 - 2
        movl	20(%rsp), %eax
        addl	%ecx, %edx
        movl	4(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_1: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_1: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_1: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_1: 6 - 6
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        xorl	%ebx, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_1: 7 - 7
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_2: 0 - 0
        movl	8+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_2: 1 - 1
        movl	24(%rsp), %eax
        addl	%ecx, %edx
        movl	8(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_2: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_2: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_2: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_2: 6 - 6
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        xorl	%ebx, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_2: 7 - 7
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_3: 0 - 0
        movl	12+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_3: 1 - 1
        movl	28(%rsp), %eax
        addl	%ecx, %edx
        movl	12(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_3: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_3: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_3: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_3: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_3: 6 - 6
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        xorl	%ebx, %r8d
        # iter_3: 7 - 7
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 0-3
        # msg_sched: 4-7
        # iter_4: 0 - 0
        movl	16+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_4: 1 - 1
        movl	32(%rsp), %eax
        addl	%ecx, %edx
        movl	16(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_4: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_4: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_4: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_4: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_4: 6 - 7
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_5: 0 - 0
        movl	20+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_5: 1 - 2
        movl	36(%rsp), %eax
        addl	%ecx, %edx
        movl	20(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_5: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_5: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_5: 5 - 5
        # --- Interior of the SM3 AVX1+RORX compress loop (entry/exit are
        # outside this region). Scalar regs r8d..r15d carry the eight SM3
        # working words; their roles rotate each iteration instead of being
        # shuffled. The per-iteration pattern is consistent with SM3
        # rounds < 16 (TODO confirm against the C reference):
        #   ecx = rotl32(A,12)            (rorxl $20 == rotl 12)
        #   edx = rotl32(rotl32(A,12)+E+T[j], 7)   = SS1
        #   ecx ^= edx                               = SS2
        #   FF/GG are plain XOR for rounds < 16 (the xorl triples below)
        #   new E = P0(x) = x ^ rotl(x,9) ^ rotl(x,17)
        #           (rorxl $23 == rotl 9, rorxl $15 == rotl 17)
        #   B <- rotl(B,9) (roll $9), F <- rotl(F,19) (roll $19)
        # The interleaved xmm ops expand the message schedule: each
        # vpslld/vpsrld pair forming (x<<15 | x>>17) or (x<<23 | x>>9) is a
        # 32-bit rotate used to build P1(x) = x ^ rotl(x,15) ^ rotl(x,23).
        addl	%ecx, %r10d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_5: 6 - 6
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_5: 7 - 7
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_6: 0 - 0
        # Round constant T[6] loaded RIP-relative from the pre-rotated table.
        movl	24+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_6: 1 - 1
        # W[j+4] and W[j] read from the stack-resident schedule (4 bytes/word).
        movl	40(%rsp), %eax
        addl	%ecx, %edx
        movl	24(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_6: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_6: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_6: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_6: 6 - 6
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_6: 7 - 7
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_7: 0 - 0
        movl	28+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_7: 1 - 1
        movl	44(%rsp), %eax
        addl	%ecx, %edx
        movl	28(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_7: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_7: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_7: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_7: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        # Merge the two half-results of the new W slice into xmm1
        # (vpblendw $0xc0 takes the top quadword's upper words from xmm10).
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_7: 6 - 6
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        # iter_7: 7 - 7
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 4-7
        # x2_to_w: 16
        # Spill the freshly expanded W words to the stack schedule for the
        # scalar rounds that will consume them later.
        vmovdqu	%xmm0, 64(%rsp)
        vmovdqu	%xmm1, 80(%rsp)
        # msg_sched: 8-11
        # Rounds 8-11 (still the round<16 XOR boolean functions) interleaved
        # with the SIMD expansion of the next four W words from slices
        # xmm2/xmm3 and xmm0/xmm1. vpalignr extracts the staggered W[j-13],
        # W[j-6], W[j-3] style windows the SM3 schedule needs; the
        # shift/or pairs are 32-bit rotates (7, 15, 23) feeding P1.
        # iter_8: 0 - 0
        movl	32+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_8: 1 - 1
        movl	48(%rsp), %eax
        addl	%ecx, %edx
        movl	32(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_8: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_8: 3 - 3
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r8d, %r15d
        movl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_8: 4 - 4
        xorl	%r9d, %r15d
        xorl	%r13d, %r11d
        xorl	%r10d, %r15d
        xorl	%r14d, %r11d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_8: 5 - 5
        addl	%ecx, %r15d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_8: 6 - 7
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_9: 0 - 0
        movl	36+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_9: 1 - 2
        movl	52(%rsp), %eax
        addl	%ecx, %edx
        movl	36(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_9: 3 - 3
        addl	%r14d, %edx
        addl	%r10d, %ecx
        movl	%r15d, %r14d
        movl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_9: 4 - 4
        xorl	%r8d, %r14d
        xorl	%r12d, %r10d
        xorl	%r9d, %r14d
        xorl	%r13d, %r10d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_9: 5 - 5
        addl	%ecx, %r14d
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_9: 6 - 6
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        xorl	%ebx, %r10d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_9: 7 - 7
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_10: 0 - 0
        movl	40+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_10: 1 - 1
        movl	56(%rsp), %eax
        addl	%ecx, %edx
        movl	40(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_10: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_10: 3 - 4
        addl	%r13d, %edx
        addl	%r9d, %ecx
        movl	%r14d, %r13d
        movl	%r10d, %r9d
        xorl	%r15d, %r13d
        xorl	%r11d, %r9d
        xorl	%r8d, %r13d
        xorl	%r12d, %r9d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_10: 5 - 5
        addl	%ecx, %r13d
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_10: 6 - 6
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        xorl	%ebx, %r9d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_10: 7 - 7
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_11: 0 - 0
        movl	44+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_11: 1 - 1
        movl	60(%rsp), %eax
        addl	%ecx, %edx
        movl	44(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_11: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_11: 3 - 3
        addl	%r12d, %edx
        addl	%r8d, %ecx
        movl	%r13d, %r12d
        movl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_11: 4 - 4
        xorl	%r14d, %r12d
        xorl	%r10d, %r8d
        xorl	%r15d, %r12d
        xorl	%r11d, %r8d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_11: 5 - 5
        addl	%ecx, %r12d
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        # New 4-word W slice completed into xmm2.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_11: 6 - 6
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        xorl	%ebx, %r8d
        # iter_11: 7 - 7
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 8-11
        # msg_sched: 12-15
        # Rounds 12-15: last group using the round<16 XOR boolean functions.
        # Same structure as the preceding msg_sched groups — scalar SM3
        # rounds interleaved with SIMD expansion of the next W slice
        # (result lands in xmm3, spilled to the stack at x2_to_w below).
        # iter_12: 0 - 0
        movl	48+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_12: 1 - 1
        movl	64(%rsp), %eax
        addl	%ecx, %edx
        movl	48(%rsp), %ebx
        roll	$7, %edx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_12: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_12: 3 - 3
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r12d, %r11d
        movl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_12: 4 - 4
        xorl	%r13d, %r11d
        xorl	%r9d, %r15d
        xorl	%r14d, %r11d
        xorl	%r10d, %r15d
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_12: 5 - 5
        addl	%ecx, %r11d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_12: 6 - 7
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_13: 0 - 0
        movl	52+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_13: 1 - 2
        movl	68(%rsp), %eax
        addl	%ecx, %edx
        movl	52(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_13: 3 - 3
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r11d, %r10d
        movl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_13: 4 - 4
        xorl	%r12d, %r10d
        xorl	%r8d, %r14d
        xorl	%r13d, %r10d
        xorl	%r9d, %r14d
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_13: 5 - 5
        addl	%ecx, %r10d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_13: 6 - 6
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_13: 7 - 7
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_14: 0 - 0
        movl	56+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_14: 1 - 1
        movl	72(%rsp), %eax
        addl	%ecx, %edx
        movl	56(%rsp), %ebx
        roll	$7, %edx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_14: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_14: 3 - 4
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r10d, %r9d
        movl	%r14d, %r13d
        xorl	%r11d, %r9d
        xorl	%r15d, %r13d
        xorl	%r12d, %r9d
        xorl	%r8d, %r13d
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_14: 5 - 5
        addl	%ecx, %r9d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_14: 6 - 6
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_14: 7 - 7
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_15: 0 - 0
        movl	60+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_15: 1 - 1
        movl	76(%rsp), %eax
        addl	%ecx, %edx
        movl	60(%rsp), %ebx
        roll	$7, %edx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_15: 2 - 2
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_15: 3 - 3
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r9d, %r8d
        movl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_15: 4 - 4
        xorl	%r10d, %r8d
        xorl	%r14d, %r12d
        xorl	%r11d, %r8d
        xorl	%r15d, %r12d
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_15: 5 - 5
        addl	%ecx, %r8d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        # New 4-word W slice completed into xmm3.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_15: 6 - 6
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        # iter_15: 7 - 7
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 12-15
        # x2_to_w: 24
        # Spill expanded W words for the upcoming scalar rounds.
        vmovdqu	%xmm2, 96(%rsp)
        vmovdqu	%xmm3, 112(%rsp)
        # msg_sched: 16-19
        # From round 16 onward the boolean functions change, matching SM3:
        #   FF(A,B,C) = (A&B) | (A&C) | (B&C)  — built from the mov/xor/and
        #               majority identity ((A^B)&(A^C))^A below
        #   GG(E,F,G) = (E&F) | (~E&G)         — built with BMI1 `andn`
        # Note: `andn` is used in its 64-bit form (e.g. andn %r14,%r12,%r11);
        # only the low 32 bits are consumed by the following 32-bit ops.
        # iter_16: 0 - 0
        movl	64+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	80(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_16: 1 - 1
        addl	%ecx, %edx
        movl	64(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_16: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_16: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_16: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_16: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_16: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_17: 0 - 0
        movl	68+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	84(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_17: 1 - 2
        addl	%ecx, %edx
        movl	68(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_17: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_17: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_17: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_17: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_17: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_18: 0 - 0
        movl	72+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	88(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_18: 1 - 1
        addl	%ecx, %edx
        movl	72(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_18: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_18: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_18: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_18: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_18: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_19: 0 - 0
        movl	76+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	92(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_19: 1 - 1
        addl	%ecx, %edx
        movl	76(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_19: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_19: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_19: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_19: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        # New 4-word W slice completed into xmm0.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_19: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_19: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 16-19
        # msg_sched: 20-23
        # Rounds 20-23 (round>=16 boolean functions, same structure as the
        # 16-19 group above); the expanded slice lands in xmm1 and both
        # xmm0/xmm1 are spilled at x2_to_w: 32 below.
        # iter_20: 0 - 0
        movl	80+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	96(%rsp), %eax
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_20: 1 - 1
        addl	%ecx, %edx
        movl	80(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_20: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_20: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_20: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_20: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_20: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_21: 0 - 0
        movl	84+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	100(%rsp), %eax
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_21: 1 - 2
        addl	%ecx, %edx
        movl	84(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_21: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_21: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_21: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_21: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_21: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_22: 0 - 0
        movl	88+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	104(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_22: 1 - 1
        addl	%ecx, %edx
        movl	88(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_22: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_22: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_22: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_22: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_22: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_23: 0 - 0
        movl	92+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	108(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_23: 1 - 1
        addl	%ecx, %edx
        movl	92(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_23: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_23: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_23: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_23: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        # New 4-word W slice completed into xmm1.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_23: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_23: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 20-23
        # x2_to_w: 32
        # Spill expanded W words for the upcoming scalar rounds.
        vmovdqu	%xmm0, 128(%rsp)
        vmovdqu	%xmm1, 144(%rsp)
        # msg_sched: 24-27
        # Rounds 24-27: identical structure to the 16-19 group above; the
        # xmm source-slice rotation (xmm2/xmm3 -> xmm0/xmm1 windows)
        # advances by one slice each group, producing xmm2.
        # iter_24: 0 - 0
        movl	96+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	112(%rsp), %eax
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_24: 1 - 1
        addl	%ecx, %edx
        movl	96(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_24: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_24: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_24: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_24: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_24: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_25: 0 - 0
        movl	100+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	116(%rsp), %eax
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_25: 1 - 2
        addl	%ecx, %edx
        movl	100(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_25: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_25: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_25: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_25: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_25: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_26: 0 - 0
        movl	104+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	120(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_26: 1 - 1
        addl	%ecx, %edx
        movl	104(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_26: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_26: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_26: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_26: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_26: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_27: 0 - 0
        movl	108+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	124(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_27: 1 - 1
        addl	%ecx, %edx
        movl	108(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_27: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_27: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_27: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_27: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        # New 4-word W slice completed into xmm2.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_27: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_27: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 24-27
        # msg_sched: 28-31
        # Rounds 28-31, same structure as the 20-23 group above; the
        # expanded slice lands in xmm3 and xmm2/xmm3 are spilled at
        # x2_to_w: 40 below.
        # iter_28: 0 - 0
        movl	112+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	128(%rsp), %eax
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_28: 1 - 1
        addl	%ecx, %edx
        movl	112(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_28: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_28: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_28: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_28: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_28: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_29: 0 - 0
        movl	116+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	132(%rsp), %eax
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_29: 1 - 2
        addl	%ecx, %edx
        movl	116(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_29: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_29: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_29: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_29: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_29: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_30: 0 - 0
        movl	120+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	136(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_30: 1 - 1
        addl	%ecx, %edx
        movl	120(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_30: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_30: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_30: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_30: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_30: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_31: 0 - 0
        movl	124+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	140(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_31: 1 - 1
        addl	%ecx, %edx
        movl	124(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_31: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_31: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_31: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_31: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        # New 4-word W slice completed into xmm3.
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_31: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_31: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 28-31
        # x2_to_w: 40
        # Spill expanded W words for the upcoming scalar rounds.
        vmovdqu	%xmm2, 160(%rsp)
        vmovdqu	%xmm3, 176(%rsp)
        # msg_sched: 32-35
        # iter_32: 0 - 0
        movl	128+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	144(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_32: 1 - 1
        addl	%ecx, %edx
        movl	128(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_32: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_32: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_32: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_32: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_32: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_33: 0 - 0
        movl	132+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	148(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_33: 1 - 2
        addl	%ecx, %edx
        movl	132(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_33: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_33: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_33: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_33: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_33: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_34: 0 - 0
        movl	136+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	152(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_34: 1 - 1
        addl	%ecx, %edx
        movl	136(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_34: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_34: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_34: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_34: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_34: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_35: 0 - 0
        movl	140+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	156(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_35: 1 - 1
        addl	%ecx, %edx
        movl	140(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_35: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_35: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_35: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_35: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_35: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_35: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 32-35
        # msg_sched: 36-39
        # iter_36: 0 - 0
        movl	144+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	160(%rsp), %eax
        vpalignr	$12, %xmm1, %xmm2, %xmm5
        # iter_36: 1 - 1
        addl	%ecx, %edx
        movl	144(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm3, %xmm0, %xmm4
        # iter_36: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_36: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_36: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_36: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm2, %xmm3, %xmm10
        # iter_36: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm1, %xmm10
        # iter_37: 0 - 0
        movl	148+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	164(%rsp), %eax
        vpshufd	$0xf9, %xmm0, %xmm4
        # iter_37: 1 - 2
        addl	%ecx, %edx
        movl	148(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_37: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_37: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_37: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_37: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_37: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_38: 0 - 0
        movl	152+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	168(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_38: 1 - 1
        addl	%ecx, %edx
        movl	152(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_38: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_38: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_38: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_38: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_38: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_39: 0 - 0
        movl	156+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	172(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_39: 1 - 1
        addl	%ecx, %edx
        movl	156(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_39: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_39: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_39: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_39: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm1
        # iter_39: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_39: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 36-39
        # x2_to_w: 48
        vmovdqu	%xmm0, 192(%rsp)
        vmovdqu	%xmm1, 208(%rsp)
        # msg_sched: 40-43
        # iter_40: 0 - 0
        movl	160+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	176(%rsp), %eax
        vpalignr	$12, %xmm2, %xmm3, %xmm5
        # iter_40: 1 - 1
        addl	%ecx, %edx
        movl	160(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm0, %xmm1, %xmm4
        # iter_40: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_40: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_40: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_40: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm3, %xmm0, %xmm10
        # iter_40: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm2, %xmm10
        # iter_41: 0 - 0
        movl	164+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	180(%rsp), %eax
        vpshufd	$0xf9, %xmm1, %xmm4
        # iter_41: 1 - 2
        addl	%ecx, %edx
        movl	164(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_41: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_41: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_41: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_41: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_41: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_42: 0 - 0
        movl	168+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	184(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_42: 1 - 1
        addl	%ecx, %edx
        movl	168(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_42: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_42: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_42: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_42: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_42: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_43: 0 - 0
        movl	172+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	188(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_43: 1 - 1
        addl	%ecx, %edx
        movl	172(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_43: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_43: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_43: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_43: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm2
        # iter_43: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_43: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 40-43
        # msg_sched: 44-47
        # iter_44: 0 - 0
        movl	176+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	192(%rsp), %eax
        vpalignr	$12, %xmm3, %xmm0, %xmm5
        # iter_44: 1 - 1
        addl	%ecx, %edx
        movl	176(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm1, %xmm2, %xmm4
        # iter_44: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_44: 3 - 3
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_44: 4 - 4
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_44: 5 - 5
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        vpalignr	$12, %xmm0, %xmm1, %xmm10
        # iter_44: 6 - 7
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        vpxor	%xmm10, %xmm3, %xmm10
        # iter_45: 0 - 0
        movl	180+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	196(%rsp), %eax
        vpshufd	$0xf9, %xmm2, %xmm4
        # iter_45: 1 - 2
        addl	%ecx, %edx
        movl	180(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_45: 3 - 3
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_45: 4 - 4
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_45: 5 - 5
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_45: 6 - 6
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_45: 7 - 7
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_46: 0 - 0
        movl	184+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	200(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_46: 1 - 1
        addl	%ecx, %edx
        movl	184(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_46: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_46: 3 - 4
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_46: 5 - 5
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_46: 6 - 6
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_46: 7 - 7
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_47: 0 - 0
        movl	188+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	204(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_47: 1 - 1
        addl	%ecx, %edx
        movl	188(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_47: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_47: 3 - 3
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_47: 4 - 4
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_47: 5 - 5
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm3
        # iter_47: 6 - 6
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        # iter_47: 7 - 7
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # msg_sched done: 44-47
        # x2_to_w: 56
        vmovdqu	%xmm2, 224(%rsp)
        vmovdqu	%xmm3, 240(%rsp)
        # msg_sched: 48-51
        # iter_48: 0 - 0
        movl	192+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	208(%rsp), %eax
        vpalignr	$12, %xmm0, %xmm1, %xmm5
        # iter_48: 1 - 1
        addl	%ecx, %edx
        movl	192(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpalignr	$8, %xmm2, %xmm3, %xmm4
        # iter_48: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        vpslld	$7, %xmm5, %xmm8
        vpsrld	$25, %xmm5, %xmm9
        # iter_48: 3 - 3
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        vpor	%xmm9, %xmm8, %xmm9
        # iter_48: 4 - 4
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        vpxor	%xmm9, %xmm4, %xmm9
        # iter_48: 5 - 5
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        vpalignr	$12, %xmm1, %xmm2, %xmm10
        # iter_48: 6 - 7
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        vpxor	%xmm10, %xmm0, %xmm10
        # iter_49: 0 - 0
        movl	196+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r15d, %ecx
        addl	%r11d, %edx
        movl	212(%rsp), %eax
        vpshufd	$0xf9, %xmm3, %xmm4
        # iter_49: 1 - 2
        addl	%ecx, %edx
        movl	196(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_49: 3 - 3
        addl	%r10d, %ecx
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_49: 4 - 4
        xorl	%r9d, %ebx
        andn	%r13, %r11, %r10
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        vpxor	%xmm4, %xmm10, %xmm4
        # iter_49: 5 - 5
        xorl	%r8d, %r14d
        andl	%r12d, %ebx
        addl	%ecx, %r14d
        orl	%ebx, %r10d
        vpslld	$15, %xmm4, %xmm8
        vpsrld	$0x11, %xmm4, %xmm7
        # iter_49: 6 - 6
        addl	%edx, %r10d
        rorxl	$15, %r10d, %ebx
        rorxl	$23, %r10d, %eax
        roll	$9, %r8d
        vpslld	$23, %xmm4, %xmm6
        vpsrld	$9, %xmm4, %xmm5
        # iter_49: 7 - 7
        xorl	%ebx, %r10d
        roll	$19, %r12d
        xorl	%eax, %r10d
        # iter_50: 0 - 0
        movl	200+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r14d, %ecx
        addl	%r10d, %edx
        movl	216(%rsp), %eax
        vpor	%xmm8, %xmm7, %xmm8
        vpor	%xmm6, %xmm5, %xmm6
        # iter_50: 1 - 1
        addl	%ecx, %edx
        movl	200(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpxor	%xmm8, %xmm6, %xmm8
        vpxor	%xmm4, %xmm9, %xmm4
        # iter_50: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx
        vpxor	%xmm8, %xmm4, %xmm8
        # iter_50: 3 - 4
        addl	%r9d, %ecx
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d
        xorl	%r8d, %ebx
        andn	%r12, %r10, %r9
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        vpshufd	$0x00, %xmm8, %xmm4
        # iter_50: 5 - 5
        xorl	%r15d, %r13d
        andl	%r11d, %ebx
        addl	%ecx, %r13d
        orl	%ebx, %r9d
        vpslld	$15, %xmm4, %xmm5
        vpsrld	$0x11, %xmm4, %xmm4
        # iter_50: 6 - 6
        addl	%edx, %r9d
        rorxl	$15, %r9d, %ebx
        rorxl	$23, %r9d, %eax
        roll	$9, %r15d
        vpor	%xmm4, %xmm5, %xmm4
        # iter_50: 7 - 7
        xorl	%ebx, %r9d
        roll	$19, %r11d
        xorl	%eax, %r9d
        vpxor	%xmm10, %xmm4, %xmm10
        # iter_51: 0 - 0
        movl	204+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r13d, %ecx
        addl	%r9d, %edx
        movl	220(%rsp), %eax
        vpslld	$15, %xmm10, %xmm7
        vpsrld	$0x11, %xmm10, %xmm6
        # iter_51: 1 - 1
        addl	%ecx, %edx
        movl	204(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        vpslld	$23, %xmm10, %xmm5
        vpsrld	$9, %xmm10, %xmm4
        # iter_51: 2 - 2
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx
        vpor	%xmm7, %xmm6, %xmm7
        vpor	%xmm5, %xmm4, %xmm5
        # iter_51: 3 - 3
        addl	%r8d, %ecx
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d
        vpxor	%xmm10, %xmm7, %xmm10
        vpxor	%xmm5, %xmm9, %xmm5
        # iter_51: 4 - 4
        xorl	%r15d, %ebx
        andn	%r11, %r9, %r8
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        vpxor	%xmm10, %xmm5, %xmm10
        # iter_51: 5 - 5
        xorl	%r14d, %r12d
        andl	%r10d, %ebx
        addl	%ecx, %r12d
        orl	%ebx, %r8d
        vpblendw	$0xc0, %xmm10, %xmm8, %xmm0
        # iter_51: 6 - 6
        addl	%edx, %r8d
        rorxl	$15, %r8d, %ebx
        rorxl	$23, %r8d, %eax
        roll	$9, %r14d
        # iter_51: 7 - 7
        xorl	%ebx, %r8d
        roll	$19, %r10d
        xorl	%eax, %r8d
        # msg_sched done: 48-51
        # iter_52: 0 - 7
        movl	208+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r12d, %ecx
        addl	%r8d, %edx
        movl	224(%rsp), %eax
        addl	%ecx, %edx
        movl	208(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx
        addl	%r15d, %ecx
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d
        xorl	%r14d, %ebx
        andn	%r10, %r8, %r15
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        xorl	%r13d, %r11d
        andl	%r9d, %ebx
        addl	%ecx, %r11d
        orl	%ebx, %r15d
        addl	%edx, %r15d
        rorxl	$15, %r15d, %ebx
        rorxl	$23, %r15d, %eax
        roll	$9, %r13d
        xorl	%ebx, %r15d
        roll	$19, %r9d
        xorl	%eax, %r15d
        # iter_53: 0 - 7
        movl	212+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r11d, %ecx
        addl	%r15d, %edx
        movl	228(%rsp), %eax
        addl	%ecx, %edx
        movl	212(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx
        addl	%r14d, %ecx
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d
        xorl	%r13d, %ebx
        andn	%r9, %r15, %r14
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        xorl	%r12d, %r10d
        andl	%r8d, %ebx
        addl	%ecx, %r10d
        orl	%ebx, %r14d
        addl	%edx, %r14d
        rorxl	$15, %r14d, %ebx
        rorxl	$23, %r14d, %eax
        roll	$9, %r12d
        xorl	%ebx, %r14d
        roll	$19, %r8d
        xorl	%eax, %r14d
        # iter_54: 0 - 7
        movl	216+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r10d, %ecx
        addl	%r14d, %edx
        movl	232(%rsp), %eax
        addl	%ecx, %edx
        movl	216(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx
        addl	%r13d, %ecx
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d
        xorl	%r12d, %ebx
        andn	%r8, %r14, %r13
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        xorl	%r11d, %r9d
        andl	%r15d, %ebx
        addl	%ecx, %r9d
        orl	%ebx, %r13d
        addl	%edx, %r13d
        rorxl	$15, %r13d, %ebx
        rorxl	$23, %r13d, %eax
        roll	$9, %r11d
        xorl	%ebx, %r13d
        roll	$19, %r15d
        xorl	%eax, %r13d
        # iter_55: 0 - 7
        movl	220+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r9d, %ecx
        addl	%r13d, %edx
        movl	236(%rsp), %eax
        addl	%ecx, %edx
        movl	220(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx
        addl	%r12d, %ecx
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d
        xorl	%r11d, %ebx
        andn	%r15, %r13, %r12
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        xorl	%r10d, %r8d
        andl	%r14d, %ebx
        addl	%ecx, %r8d
        orl	%ebx, %r12d
        addl	%edx, %r12d
        rorxl	$15, %r12d, %ebx
        rorxl	$23, %r12d, %eax
        roll	$9, %r10d
        xorl	%ebx, %r12d
        roll	$19, %r14d
        xorl	%eax, %r12d
        # x0_to_w: 64
        vmovdqu	%xmm0, 256(%rsp)
        # iter_56: 0 - 7
        movl	224+L_SM3_AVX1_RORX_t(%rip), %edx
        rorxl	$20, %r8d, %ecx
        addl	%r12d, %edx
        movl	240(%rsp), %eax
        addl	%ecx, %edx
        movl	224(%rsp), %ebx
        roll	$7, %edx
        xorl	%edx, %ecx
        xorl	%ebx, %eax
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r15d, %edx
        addl	%r11d, %ecx
        movl	%r9d, %r15d
        movl	%r9d, %ebx
        xorl	%r8d, %r15d
        xorl	%r10d, %ebx
        andn	%r14, %r12, %r11
        andl	%ebx, %r15d
        movl	%r12d, %ebx
        xorl	%r9d, %r15d
        andl	%r13d, %ebx
        addl	%ecx, %r15d
        orl	%ebx, %r11d
        addl	%edx, %r11d
        rorxl	$15, %r11d, %ebx
        rorxl	$23, %r11d, %eax
        roll	$9, %r9d
        xorl	%ebx, %r11d
        roll	$19, %r13d
        xorl	%eax, %r11d
        # iter_57: 0 - 7
        # Round 57; variables rotated one slot: A=r15 B=r8 C=r9 D=r10
        # E=r11 F=r12 G=r13 H=r14.  W[57]=228(%rsp), W[61]=244(%rsp).
        movl	228+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[57]
        rorxl	$20, %r15d, %ecx        # ecx = rol(A,12)
        addl	%r11d, %edx             # + E
        movl	244(%rsp), %eax         # eax = W[61]
        addl	%ecx, %edx
        movl	228(%rsp), %ebx         # ebx = W[57]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[57] = W[57] ^ W[61]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r14d, %edx             # edx = SS1 + W[57] + H
        addl	%r10d, %ecx             # ecx = SS2 + W'[57] + D
        movl	%r8d, %r14d
        movl	%r8d, %ebx
        xorl	%r15d, %r14d            # A ^ B
        xorl	%r9d, %ebx              # B ^ C
        andn	%r13, %r11, %r10        # r10 = ~E & G
        andl	%ebx, %r14d
        movl	%r11d, %ebx
        xorl	%r8d, %r14d             # r14 = Maj(A,B,C)
        andl	%r12d, %ebx             # ebx = E & F
        addl	%ecx, %r14d             # r14 = TT1 -> new A
        orl	%ebx, %r10d             # r10 = GG(E,F,G)
        addl	%edx, %r10d             # r10 = TT2
        rorxl	$15, %r10d, %ebx        # rol(TT2,17)
        rorxl	$23, %r10d, %eax        # rol(TT2,9)
        roll	$9, %r8d                # new C = rol(B,9)
        xorl	%ebx, %r10d
        roll	$19, %r12d              # new G = rol(F,19)
        xorl	%eax, %r10d             # r10 = P0(TT2) -> new E
        # iter_58: 0 - 7
        # Round 58; A=r14 B=r15 C=r8 D=r9 E=r10 F=r11 G=r12 H=r13.
        # W[58]=232(%rsp), W[62]=248(%rsp).
        movl	232+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[58]
        rorxl	$20, %r14d, %ecx        # ecx = rol(A,12)
        addl	%r10d, %edx             # + E
        movl	248(%rsp), %eax         # eax = W[62]
        addl	%ecx, %edx
        movl	232(%rsp), %ebx         # ebx = W[58]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[58] = W[58] ^ W[62]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r13d, %edx             # edx = SS1 + W[58] + H
        addl	%r9d, %ecx              # ecx = SS2 + W'[58] + D
        movl	%r15d, %r13d
        movl	%r15d, %ebx
        xorl	%r14d, %r13d            # A ^ B
        xorl	%r8d, %ebx              # B ^ C
        andn	%r12, %r10, %r9         # r9 = ~E & G
        andl	%ebx, %r13d
        movl	%r10d, %ebx
        xorl	%r15d, %r13d            # r13 = Maj(A,B,C)
        andl	%r11d, %ebx             # ebx = E & F
        addl	%ecx, %r13d             # r13 = TT1 -> new A
        orl	%ebx, %r9d              # r9 = GG(E,F,G)
        addl	%edx, %r9d              # r9 = TT2
        rorxl	$15, %r9d, %ebx         # rol(TT2,17)
        rorxl	$23, %r9d, %eax         # rol(TT2,9)
        roll	$9, %r15d               # new C = rol(B,9)
        xorl	%ebx, %r9d
        roll	$19, %r11d              # new G = rol(F,19)
        xorl	%eax, %r9d              # r9 = P0(TT2) -> new E
        # iter_59: 0 - 7
        # Round 59; A=r13 B=r14 C=r15 D=r8 E=r9 F=r10 G=r11 H=r12.
        # W[59]=236(%rsp), W[63]=252(%rsp).
        movl	236+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[59]
        rorxl	$20, %r13d, %ecx        # ecx = rol(A,12)
        addl	%r9d, %edx              # + E
        movl	252(%rsp), %eax         # eax = W[63]
        addl	%ecx, %edx
        movl	236(%rsp), %ebx         # ebx = W[59]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[59] = W[59] ^ W[63]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r12d, %edx             # edx = SS1 + W[59] + H
        addl	%r8d, %ecx              # ecx = SS2 + W'[59] + D
        movl	%r14d, %r12d
        movl	%r14d, %ebx
        xorl	%r13d, %r12d            # A ^ B
        xorl	%r15d, %ebx             # B ^ C
        andn	%r11, %r9, %r8          # r8 = ~E & G
        andl	%ebx, %r12d
        movl	%r9d, %ebx
        xorl	%r14d, %r12d            # r12 = Maj(A,B,C)
        andl	%r10d, %ebx             # ebx = E & F
        addl	%ecx, %r12d             # r12 = TT1 -> new A
        orl	%ebx, %r8d              # r8 = GG(E,F,G)
        addl	%edx, %r8d              # r8 = TT2
        rorxl	$15, %r8d, %ebx         # rol(TT2,17)
        rorxl	$23, %r8d, %eax         # rol(TT2,9)
        roll	$9, %r14d               # new C = rol(B,9)
        xorl	%ebx, %r8d
        roll	$19, %r10d              # new G = rol(F,19)
        xorl	%eax, %r8d              # r8 = P0(TT2) -> new E
        # iter_60: 0 - 7
        # Round 60; A=r12 B=r13 C=r14 D=r15 E=r8 F=r9 G=r10 H=r11.
        # W[60]=240(%rsp), W[64]=256(%rsp) (stored by the vmovdqu above).
        movl	240+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[60]
        rorxl	$20, %r12d, %ecx        # ecx = rol(A,12)
        addl	%r8d, %edx              # + E
        movl	256(%rsp), %eax         # eax = W[64]
        addl	%ecx, %edx
        movl	240(%rsp), %ebx         # ebx = W[60]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[60] = W[60] ^ W[64]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r11d, %edx             # edx = SS1 + W[60] + H
        addl	%r15d, %ecx             # ecx = SS2 + W'[60] + D
        movl	%r13d, %r11d
        movl	%r13d, %ebx
        xorl	%r12d, %r11d            # A ^ B
        xorl	%r14d, %ebx             # B ^ C
        andn	%r10, %r8, %r15         # r15 = ~E & G
        andl	%ebx, %r11d
        movl	%r8d, %ebx
        xorl	%r13d, %r11d            # r11 = Maj(A,B,C)
        andl	%r9d, %ebx              # ebx = E & F
        addl	%ecx, %r11d             # r11 = TT1 -> new A
        orl	%ebx, %r15d             # r15 = GG(E,F,G)
        addl	%edx, %r15d             # r15 = TT2
        rorxl	$15, %r15d, %ebx        # rol(TT2,17)
        rorxl	$23, %r15d, %eax        # rol(TT2,9)
        roll	$9, %r13d               # new C = rol(B,9)
        xorl	%ebx, %r15d
        roll	$19, %r9d               # new G = rol(F,19)
        xorl	%eax, %r15d             # r15 = P0(TT2) -> new E
        # iter_61: 0 - 7
        # Round 61; A=r11 B=r12 C=r13 D=r14 E=r15 F=r8 G=r9 H=r10.
        # W[61]=244(%rsp), W[65]=260(%rsp).
        movl	244+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[61]
        rorxl	$20, %r11d, %ecx        # ecx = rol(A,12)
        addl	%r15d, %edx             # + E
        movl	260(%rsp), %eax         # eax = W[65]
        addl	%ecx, %edx
        movl	244(%rsp), %ebx         # ebx = W[61]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[61] = W[61] ^ W[65]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r10d, %edx             # edx = SS1 + W[61] + H
        addl	%r14d, %ecx             # ecx = SS2 + W'[61] + D
        movl	%r12d, %r10d
        movl	%r12d, %ebx
        xorl	%r11d, %r10d            # A ^ B
        xorl	%r13d, %ebx             # B ^ C
        andn	%r9, %r15, %r14         # r14 = ~E & G
        andl	%ebx, %r10d
        movl	%r15d, %ebx
        xorl	%r12d, %r10d            # r10 = Maj(A,B,C)
        andl	%r8d, %ebx              # ebx = E & F
        addl	%ecx, %r10d             # r10 = TT1 -> new A
        orl	%ebx, %r14d             # r14 = GG(E,F,G)
        addl	%edx, %r14d             # r14 = TT2
        rorxl	$15, %r14d, %ebx        # rol(TT2,17)
        rorxl	$23, %r14d, %eax        # rol(TT2,9)
        roll	$9, %r12d               # new C = rol(B,9)
        xorl	%ebx, %r14d
        roll	$19, %r8d               # new G = rol(F,19)
        xorl	%eax, %r14d             # r14 = P0(TT2) -> new E
        # iter_62: 0 - 7
        # Round 62; A=r10 B=r11 C=r12 D=r13 E=r14 F=r15 G=r8 H=r9.
        # W[62]=248(%rsp), W[66]=264(%rsp).
        movl	248+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[62]
        rorxl	$20, %r10d, %ecx        # ecx = rol(A,12)
        addl	%r14d, %edx             # + E
        movl	264(%rsp), %eax         # eax = W[66]
        addl	%ecx, %edx
        movl	248(%rsp), %ebx         # ebx = W[62]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[62] = W[62] ^ W[66]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r9d, %edx              # edx = SS1 + W[62] + H
        addl	%r13d, %ecx             # ecx = SS2 + W'[62] + D
        movl	%r11d, %r9d
        movl	%r11d, %ebx
        xorl	%r10d, %r9d             # A ^ B
        xorl	%r12d, %ebx             # B ^ C
        andn	%r8, %r14, %r13         # r13 = ~E & G
        andl	%ebx, %r9d
        movl	%r14d, %ebx
        xorl	%r11d, %r9d             # r9 = Maj(A,B,C)
        andl	%r15d, %ebx             # ebx = E & F
        addl	%ecx, %r9d              # r9 = TT1 -> new A
        orl	%ebx, %r13d             # r13 = GG(E,F,G)
        addl	%edx, %r13d             # r13 = TT2
        rorxl	$15, %r13d, %ebx        # rol(TT2,17)
        rorxl	$23, %r13d, %eax        # rol(TT2,9)
        roll	$9, %r11d               # new C = rol(B,9)
        xorl	%ebx, %r13d
        roll	$19, %r15d              # new G = rol(F,19)
        xorl	%eax, %r13d             # r13 = P0(TT2) -> new E
        # iter_63: 0 - 7
        # Final round (j = 63); A=r9 B=r10 C=r11 D=r12 E=r13 F=r14 G=r15
        # H=r8.  W[63]=252(%rsp), W[67]=268(%rsp).  After this round the
        # variables land back in A..H = r8..r15 for the state feedback below.
        movl	252+L_SM3_AVX1_RORX_t(%rip), %edx   # edx = T[63]
        rorxl	$20, %r9d, %ecx         # ecx = rol(A,12)
        addl	%r13d, %edx             # + E
        movl	268(%rsp), %eax         # eax = W[67]
        addl	%ecx, %edx
        movl	252(%rsp), %ebx         # ebx = W[63]
        roll	$7, %edx                # edx = SS1
        xorl	%edx, %ecx              # ecx = SS2
        xorl	%ebx, %eax              # eax = W'[63] = W[63] ^ W[67]
        addl	%ebx, %edx
        addl	%eax, %ecx
        addl	%r8d, %edx              # edx = SS1 + W[63] + H
        addl	%r12d, %ecx             # ecx = SS2 + W'[63] + D
        movl	%r10d, %r8d
        movl	%r10d, %ebx
        xorl	%r9d, %r8d              # A ^ B
        xorl	%r11d, %ebx             # B ^ C
        andn	%r15, %r13, %r12        # r12 = ~E & G
        andl	%ebx, %r8d
        movl	%r13d, %ebx
        xorl	%r10d, %r8d             # r8 = Maj(A,B,C)
        andl	%r14d, %ebx             # ebx = E & F
        addl	%ecx, %r8d              # r8 = TT1 -> new A
        orl	%ebx, %r12d             # r12 = GG(E,F,G)
        addl	%edx, %r12d             # r12 = TT2
        rorxl	$15, %r12d, %ebx        # rol(TT2,17)
        rorxl	$23, %r12d, %eax        # rol(TT2,9)
        roll	$9, %r10d               # new C = rol(B,9)
        xorl	%ebx, %r12d
        roll	$19, %r14d              # new G = rol(F,19)
        xorl	%eax, %r12d             # r12 = P0(TT2) -> new E
        # Block finished: fold working variables into the hash state.
        # SM3's feedback is XOR (V[i+1] = ABCDEFGH ^ V[i]), unlike SHA-2's
        # addition.  The eight 32-bit state words live at (%rdi)..28(%rdi).
        xorl	(%rdi), %r8d            # A ^= state[0]
        xorl	4(%rdi), %r9d           # B ^= state[1]
        xorl	8(%rdi), %r10d          # C ^= state[2]
        xorl	12(%rdi), %r11d         # D ^= state[3]
        xorl	16(%rdi), %r12d         # E ^= state[4]
        xorl	20(%rdi), %r13d         # F ^= state[5]
        xorl	24(%rdi), %r14d         # G ^= state[6]
        xorl	28(%rdi), %r15d         # H ^= state[7]
        addq	$0x40, %rbp             # advance past the 64-byte block just
                                        # compressed (rbp presumably holds the
                                        # message pointer - set up above this chunk)
        subl	$0x40, %esi             # remaining length -= 64; sets ZF for the
                                        # jnz below (movl does not affect flags)
        movl	%r8d, (%rdi)            # write updated state back
        movl	%r9d, 4(%rdi)
        movl	%r10d, 8(%rdi)
        movl	%r11d, 12(%rdi)
        movl	%r12d, 16(%rdi)
        movl	%r13d, 20(%rdi)
        movl	%r14d, 24(%rdi)
        movl	%r15d, 28(%rdi)
        jnz	L_SM3_AVX1_RORXlen_start    # more full blocks remain: loop
        xorq	%rax, %rax              # return 0 (success)
        vzeroupper                      # clear upper YMM state before returning
                                        # to SSE/compiler code (SysV AVX ABI)
        addq	$0x110, %rsp            # release local frame (W schedule spill area)
        popq	%rbp                    # restore callee-saved registers
        popq	%r15                    # (pushed in the prologue, above this chunk)
        popq	%r14
        popq	%r13
        popq	%r12
        popq	%rbx
        repz	retq                    # "rep ret": legacy AMD branch-predictor idiom
#ifndef __APPLE__
.size	sm3_compress_len_avx1_rorx,.-sm3_compress_len_avx1_rorx
#endif /* __APPLE__ */
#endif /* HAVE_INTEL_AVX1 */
#endif /* WOLFSSL_X86_64_BUILD */
#endif /* WOLFSSL_SM3 */

#if defined(__linux__) && defined(__ELF__)
.section	.note.GNU-stack,"",%progbits
#endif
